#Tutorial 2 #We will study the variance of the sample mean as: #1. Population size, N, gets large #2. As sampling fraction n/N changes #3. all possible samples to obtain sample variance as an unbiased estimate of the population variance # - Obtain confidence intervals of smple mean and total population #Excercise 1. #generate a list of populations with different sizes #and calculate mean and variance for #each of the populations n<-7 N_vector<-c(50,100,200,300,400,500) Zpop<-list() mean_pop_vector=rep(NA,length(N_vector)) sigma2_pop_vector=rep(NA,length(N_vector)) for (i in 1:length(N_vector)) { Zpop[[i]]<-rnorm(N_vector[i]) mean_pop_vector[i]<-mean(Zpop[[i]]) sigma2_pop_vector[i]<-var(Zpop[[i]]) } #now calculate variances of the sample means #for each of the populations with different sizes #sample b samples and calculate means #then calculate variances of each of the means and make a plot #study the sample variance as population size N gets bigger b<-10000 varhatb<-rep(NA,length(N_vector)) for (i in 1:length(N_vector)) { ybar<-rep(NA,b) for (j in 1:b) { s<-sample(Zpop[[i]],n,replace=F) ybar[j]<-mean(s) } varhatb[i]<-var(ybar) } plot(N_vector,(varhatb-sigma2_pop_vector/n)^2,ylab="error",xlab="Population size N") #Excercise 2. #Question: Why is var(ybar)=(N-n)sigma2/nN instead of var(ybar)=sigma2/n as it was in the math-stat class?! # We will study this on a population generated from a N(10,1) distribution #try these values of N and n #N<-10000 #n<-200 N<-10000 n<-50 b<-10000 ybar<-rep(NA,b) zpop<-rnorm(N,10,1) for (j in 1:b) { s<-sample(zpop,n) ybar[j]<-mean(s) } #look at the sampling distribution of ybar hist(ybar) #estimate of the variance of ybar from the simulation var(ybar) #the population variance is sigma2=1 (because zpop was generated from nrom(10,1)) sigma2=1 #this is the second term of the formula for the variance of var(ybar)=sigma2/n sigma2/n #now include the correction factor in the formula for var(ybar)=((N-n)/N)*sigma2/n correctpopvar<-function(popvar,N,n){ term1<-(N-n)/N term2<-popvar/n out<-term1*term2 return(out) } #run the correction of the variance of ybar correctpopvar(1,N,n) #Excercise 3. #Like in example 2, page 16, we will #exhaust all the possible samples from a fixed population and obtain the exact sampling #distribution of the sample mean: Zpop<-1:10#the population consists of numbers 1,2,...,10 mu<-mean(Zpop)#population mean sigma2<-var(Zpop)#population variance N<-length(Zpop)#population size n<-5#sample size M<-choose(N,n)#number of possible samples (without replacement) of size n Comb<-combn(Zpop,n)#a n*M matrix with all posible samples (of size n) on the columns #to calculate the sample mean for each possible samples we use the apply function #which takes mean for each of the columns (i.e., samples) of the matrix Comb ybar<-apply(Comb,2,mean) #look at the distribution of ybar hist(ybar) #compare the mean of the ybar to the population mean mean(ybar) (mean(ybar)==mu)#sample mean is unbiased #we can do the same for the sample variance and the estimate of the variance of the sample mean: s2<-apply(Comb,2,var) #sample variance is an unbiased estimator of the population variance mean(s2)==sigma2 #exact sampling distribution of ybar, i.e., the frequency of each value that ybar takes #normalized by the number of all possibe samples pmf<-table(ybar)/M plot(pmf,ylab="p(ybar)") #define a function for calculating the estimate of var(sample mean) ybarvar<-function(smpl,N){ n<-length(smpl) term1<-(N-n)/N term2<-var(smpl)/n out<-term1*term2 return(out) } #calculate variance of each of the sample means #and use it to obtin 95% confidence interval #now we use the user specified function (ybarvar) to be applied to each column of the matrix Comb #Note that first argument of ybarvar is smpl and when ybarvar is called from #the apply function the argument smpl is passed as a column of the Comb function #so the first argument in the user specified function sould correspond to what we have #on the columns (or could be rows) of the matrix (in this case Comb) which is passed #through the apply function. The second argument of ybarvar is passed after the function (i.e, ybarvar,N=10) # ybar.var<-apply(Comb,2,ybarvar,N=N) mat_95CI_ybar.var=cbind(ybar+qt(0.025,n-1)*sqrt(ybar.var),ybar+qt(0.975,n-1)*sqrt(ybar.var)) # The matrix mat_95CI_ybar.var contains confidence intervalsfor each of the # sample means stored in the ybar variable # Repeat the same procedure to obtain estimate of population total and the estimate of its variance tau_hat=N*ybar mat_95tau_hat=cbind(tau_hat+qt(0.025,n-1)*N*sqrt(ybar.var),ybar+qt(0.975,n-1)*N*sqrt(ybar.var)) #alternatively the variance of the total population estimator could be estimated by this function tau_hat_var_hat<-function(ybar.var,N){ out=N*sqrt(ybar.var) return(out) } mat_95tau_hat=cbind(tau_hat+qt(0.025,n-1)*tau_hat_var_hat(ybar.var,N),tau_hat+qt(0.975,n-1)*tau_hat_var_hat(ybar.var,N))