#Tutorial-2 #Q: Why is var(ybar)=(N-n)sigma2/nN instead of var(ybar)=sigma2/n as it was in the math-stat class?! #We will study the variance of the sample mean as: #1. Population size, N, gets large #2. In connection with sampling with replacement #3. In the model-based approach n<-10 Nvec<-c(50,100,200,300,400,500) Ypop<-list() sig2<-c() for (i in 1:6) { Ypop[[i]]<-rnorm(Nvec[i]) sig2[i]<-var(Ypop[[i]]) } b<-10000 varhatb<-c() for (i in 1:6) { ybar<-c() for (j in 1:b) { s<-sample(Ypop[[i]],n,replace=F) ybar[j]<-mean(s) } varhatb[i]<-var(ybar) } plot(Nvec,(varhatb-sig2/n)^2,ylab="e",xlab="N") #Now we will do the same, but take another sample each time using sampling with replacement Nvec<-c(50,100,500,1000,5000) nvec<-c(10,30,50,100,500) Ypop<-list() sig2<-c() for (i in 1:5) { Ypop[[i]]<-rnorm(Nvec[i]) sig2[i]<-var(Ypop[[i]]) } b<-10000 varhatb1<-c() varhatb2<-c() for (i in 1:5) { ybar1<-c() ybar2<-c() for (j in 1:b) { s1<-sample(Ypop[[i]],nvec[i],replace=F) ybar1[j]<-mean(s1) s2<-sample(Ypop[[i]],nvec[i],replace=T) ybar2[j]<-mean(s2) } varhatb1[i]<-var(ybar1) varhatb2[i]<-var(ybar2) } plot(varhatb2,pch=16,col="red",type="l",ylab="var") lines(varhatb1,pch=16,col="blue") legend(3.5,max(varhatb2),legend=c("SRS-WR","SRS-WoR"),col=c("red","blue"),lty=c(1,1)) plot((varhatb1-varhatb2)^2,ylab="e") # Consider the example in tutorial-1 where we generated a population from a N(5,1) distribution #and then assumed that it is fixed when we ran the simulations and generated the sample from #the same population ignoring the fact that those values were generated from a model N(5,1). #Now we will run a simulation but let the population vary each time as well, i.e., we generate #a new population each time from N(5,1): N<-100 n<-10 b<-10000 ybar<-c() for (j in 1:b) { y.p<-rnorm(N,5,1) s<-sample(y.p,n)#it would be the same as generating the sample directly by rnorm(10,5,1) ybar[j]<-mean(s) } #look at the sampling distribution of ybar hist(ybar) #estimate of the variance of ybar from the simulation var(ybar) 1/n#sigma2/n fixpopvar<-function(popvar,N,n){ term1<-(N-n)/N term2<-popvar/n out<-term1*term2 return(out) } fixpopvar(1,100,10) #Recall example 2, page 17; we will do the same with a larger population and sample size, i.e., we will #look at all the possible samples from a fixed population and obtain the exact sampling #distribution of the sample mean: y.p<-1:10#the population consists of numbers 1,2,...,10 mu<-mean(y.p)#population mean sig2<-var(y.p)#population variance N<-length(y.p)#population size n<-5#sample size M<-choose(N,n)#number of possible samples (without replacement) of size n C<-combn(y.p,n)#a n*M matrix whose columns are all the possible samples of size n #to calculate the sample mean for all possible samples we just need to take the mean of #columns of C: ybar<-apply(C,2,mean)#the apply function is used to apply the "mean" function to the columns of C mean(ybar) mean(ybar)==mu#sample mean is unbiased pmf<-table(ybar)/M#exact sampling distribution of ybar, i.e., all possible values that ybar #takes over all possibe samples with the probability mass corresponding to each value #we can do the same for the sample variance and the estimate of the variance of the sample mean: plot(pmf,ylab="p(ybar)") s2<-apply(C,2,var) mean(s2) mean(s2)==sig2#sample variance is an unbiased estimator for the population variance #define a function for calculating the estimate of var(sample mean) ybarvar<-function(s,N){ n<-length(s) term1<-(N-n)/N term2<-var(s)/n out<-term1*term2 return(out) } ybar.var<-apply(C,2,ybarvar,N=100) #practice: Do the same for the estimate of population total and the estimaof its variance #as in example 2.