########################################################################
#Tutorial 1
########################################################################


########################################################################
#Download R for windows:
#https://cran.r-project.org/bin/windows/base/

#Download R for Mac:
#https://cran.r-project.org/bin/macosx/

#Download RStudio (requires installtion of R)
#https://www.rstudio.com/products/rstudio/download/
#the site contains installers for different operating systems


# Resources to start up with R
# An introductory R manual: https://cran.r-project.org/doc/contrib/Verzani-SimpleR.pdf
# by John Verzani
# you can use this online resource to practice
# introduction into R
# interactive software: http://tryr.codeschool.com/

########################################################################



x<-c(15, 14 ,18 ,12)#define a vector
x[4] #print the second component of x
#or
x<-scan(text="15 14 18 12")
mean(x)
median(x)


#generate a population:  N=100 units uniformely distributed in the square with side 2
a<-0 #lower limit for the generated values
b<-2 #upper limit for the generated values
N<-100 #population size
x_pop<-runif(100,a,b)
y_pop<-runif(100,a,b)
plot(x_pop,y_pop)

#for exercise use ?plot to explore the plot function



#Simple Random Sampling without replacement: generate a sample (without replacement) of size n=7 of this population
n=7
sample1<-sample(x=1:N,size=n) 
points(x_pop[sample1],y_pop[sample1],col="red",pch=19,cex=1.5)


#Now construct a population of size N=100 of real number (we generate these numbers from 
#a Normal(10,1) distribution)

z_pop<-rnorm(N,10,1)
#learn the population characteristics
(mean_z_pop<-mean(z_pop))#true population mean
(var_z_pop<-var(z_pop))#true population variance





#Now that we have the population 
#draw a sample of size n=7 from this population
i<-sample(1:N,n)
(z_sample<-z_pop[i])
(muhat<-mean(z_sample))#sample mean
(s2<-var(z_sample))#sample variance


#write a user function with inputs: sample and population size 
#The function calculates sample mean and the variance of the sample mean



mean_var<-function(smpl,N){
  n=length(smpl)
  #this line is calculating the mean of the sample
  smpl_mean=mean(smpl)
  
  term1=(N-n)/N
  term2=var(smpl)/n
  #equation (2.6) page 15
  var_smpl_mean=term1*term2
  
  out=list()
  out$mean=smpl_mean
  out$var=var_smpl_mean
  return(out)
}


true_var_ofTheMean<-function(population,n){
  N<-length(population)
  term1<-(N-n)/N
  term2<-var(population)/n
  #equation (2.5) page 15
  out<-term1*term2
  return(out)
}
#make a function call to calculate 
#mean and the variance of the mean estimator
#for a given sample
out_ls=mean_var(smpl=z_sample,N=N)
#and the true variance of the
#mean estimate using the population variance
true_var=true_var_ofTheMean(population=z_pop,n=n)



#exercise: write a function that takes the sample and population size as input and gives
#the sample mean, sample variance, estimated variance of the sample mean, estimate of the
#population total and its estimated variance


#simulation: study the distribution of the sample mean by repeating the previous procedure for b=10000 times 
#also study the effectiveness of the sampling strategy; 

#using the function true_var_ofTheMean we can compare the theoretical estimate of the variance 
#of the sample mean to the estimate
#of the variance obtained from the simulation

b<-10000

#defining an empty vector allocates memory in R thus avoiding large memory issues
#so create an empty vector of length b
#defining NA values takes a lot less memory
#using the for loop we are going to fill the empty vector
sample_mean=rep(NA,b)  

for (j in 1:b) {
  smpl<-sample(1:N,n)
  sample_mean[j]<-mean(z_pop[smpl])
}
hist(z_pop,nclass = 10)
#look at the sampling distribution of sample_mean
hist(sample_mean)
#estimate of the variance of sample_mean from the simulation
var(sample_mean)


#exercise: follow the changes in the histogram of the estimated mean and its variance
#with changes of the sample size n

#the trees dataset from the last class
trees
y<-trees$Volume
N<-length(y)
n<-10
units=1:N
s<-sample(units,n)

mean_var(y[s],N)


ybar_trees<-c()
for (j in 1:b) {
  s<-sample(z_pop,n)
  ybar_trees[j]<-mean(s)
}
hist(ybar_trees)
var(ybar_trees)
true_var_ofTheMean(z_pop,n)