######################################################################## #Tutorial 1 ######################################################################## ######################################################################## #Download R for windows: #https://cran.r-project.org/bin/windows/base/ #Download R for Mac: #https://cran.r-project.org/bin/macosx/ #Download RStudio (requires installtion of R) #https://www.rstudio.com/products/rstudio/download/ #the site contains installers for different operating systems # Resources to start up with R # An introductory R manual: https://cran.r-project.org/doc/contrib/Verzani-SimpleR.pdf # by John Verzani # you can use this online resource to practice # introduction into R # interactive software: http://tryr.codeschool.com/ ######################################################################## x<-c(15, 14 ,18 ,12)#define a vector x[4] #print the second component of x #or x<-scan(text="15 14 18 12") mean(x) median(x) #generate a population: N=100 units uniformely distributed in the square with side 2 a<-0 #lower limit for the generated values b<-2 #upper limit for the generated values N<-100 #population size x_pop<-runif(100,a,b) y_pop<-runif(100,a,b) plot(x_pop,y_pop) #for exercise use ?plot to explore the plot function #Simple Random Sampling without replacement: generate a sample (without replacement) of size n=7 of this population n=7 sample1<-sample(x=1:N,size=n) points(x_pop[sample1],y_pop[sample1],col="red",pch=19,cex=1.5) #Now construct a population of size N=100 of real number (we generate these numbers from #a Normal(10,1) distribution) z_pop<-rnorm(N,10,1) #learn the population characteristics (mean_z_pop<-mean(z_pop))#true population mean (var_z_pop<-var(z_pop))#true population variance #Now that we have the population #draw a sample of size n=7 from this population i<-sample(1:N,n) (z_sample<-z_pop[i]) (muhat<-mean(z_sample))#sample mean (s2<-var(z_sample))#sample variance #write a user function with inputs: sample and population size #The function calculates sample mean and the variance of the sample mean mean_var<-function(smpl,N){ n=length(smpl) #this line is calculating the mean of the sample smpl_mean=mean(smpl) term1=(N-n)/N term2=var(smpl)/n #equation (2.6) page 15 var_smpl_mean=term1*term2 out=list() out$mean=smpl_mean out$var=var_smpl_mean return(out) } true_var_ofTheMean<-function(population,n){ N<-length(population) term1<-(N-n)/N term2<-var(population)/n #equation (2.5) page 15 out<-term1*term2 return(out) } #make a function call to calculate #mean and the variance of the mean estimator #for a given sample out_ls=mean_var(smpl=z_sample,N=N) #and the true variance of the #mean estimate using the population variance true_var=true_var_ofTheMean(population=z_pop,n=n) #exercise: write a function that takes the sample and population size as input and gives #the sample mean, sample variance, estimated variance of the sample mean, estimate of the #population total and its estimated variance #simulation: study the distribution of the sample mean by repeating the previous procedure for b=10000 times #also study the effectiveness of the sampling strategy; #using the function true_var_ofTheMean we can compare the theoretical estimate of the variance #of the sample mean to the estimate #of the variance obtained from the simulation b<-10000 #defining an empty vector allocates memory in R thus avoiding large memory issues #so create an empty vector of length b #defining NA values takes a lot less memory #using the for loop we are going to fill the empty vector sample_mean=rep(NA,b) for (j in 1:b) { smpl<-sample(1:N,n) sample_mean[j]<-mean(z_pop[smpl]) } hist(z_pop,nclass = 10) #look at the sampling distribution of sample_mean hist(sample_mean) #estimate of the variance of sample_mean from the simulation var(sample_mean) #exercise: follow the changes in the histogram of the estimated mean and its variance #with changes of the sample size n #the trees dataset from the last class trees y<-trees$Volume N<-length(y) n<-10 units=1:N s<-sample(units,n) mean_var(y[s],N) ybar_trees<-c() for (j in 1:b) { s<-sample(z_pop,n) ybar_trees[j]<-mean(s) } hist(ybar_trees) var(ybar_trees) true_var_ofTheMean(z_pop,n)