#The goal is to estimate the population toal #for a simulated population #let y=x1+x2+some noise is the population N<-100 #population size x1<-runif(100) x2<-runif(100) y<-x1+x2+rnorm(100,0,.25) tau<-sum(y) #let the sample size be 30 n=30 #Study the estimate of tau #using different ways of stratification #this way we learn which stratification strategy gives the best #estimate of tau (smallest variance) #In principle, estimate of tau will be the most precise if #the population is stratified such that unitis within each stratum are #as similar as possible #let the number of stratum be L=3 # design 1 #look into the plot of the data plot(x1,x2) #divide the region into 3 stratum #by these two lines: lines(c(.5,0),c(0,.5)) lines(c(1,.5),c(.5,1)) #create the stratum variable z1<-length(x1+x2) z1[(x1+x2)<.5]<-1 z1[(x1+x2)>.5 & (x1+x2)<1.5]<-2 z1[(x1+x2)>1.5]<-3 tapply(y,z1,mean) table(z1) L<-3 nh1=Nh1=tauh1= numeric(L) for (h in 1:L) { Nh1[h] = length(z1[z1==h]) nh1[h] = round(n*Nh1[h]/N) #page 147 sample size for stratum h, stratums are different in size alo known as proportional allocation s=sample(1:Nh1[h],nh1[h]) points(x1[z1==h][s],x2[z1==h][s],col="red") tauh1[h]<-Nh1[h]*mean(y[z1==h][s]) } tauhat1<-sum(tauh1) #ubiased estimate of tau tau_st_hat=sum(tau_hat_h) page 142 formula #Lets take a look into #sample size in each of the stratum nh1 #pop size in each of the stratum Nh1 # design 2 #change the boundaries plot(x1,x2) lines(c(.5,1),c(0,.5)) lines(c(0,.5),c(.5,1)) L<-3 z2<-length(x1+x2) z2[(x1-x2)< -.5]<-1 z2[(x1-x2)> -.5 & (x1-x2)<.5]<-2 z2[(x1-x2)>.5]<-3 #let's take a look into the mean of each stratum tapply(y,z2,mean) table(z2) tapply(y,z2,var) tapply(y,z2,sd) nh2=Nh2=tauh2=numeric(L) for (h in 1:L) { Nh2[h]<-length(z2[z2==h]) nh2[h]<-round(n*Nh2[h]/N) s<-sample(1:Nh2[h],nh2[h]) points(x1[z2==h][s],x2[z2==h][s],col="red") tauh2[h]<-Nh2[h]*mean(y[z2==h][s]) } tauhat2<-sum(tauh2) # design 3 plot(x1,x2) lines(c(.2,.2),c(0,1)) lines(c(.8,.8),c(0,1)) z3<-length(x1+x2) z3[x1<.2]<-1 z3[x1>.2 & x1<.8]<-2 z3[x1>.8]<-3 L<-3 nh3=Nh3=tauh3=numeric(L) for (h in 1:L) { Nh3[h] = length(z3[z3==h]) nh3[h] = round(n*Nh3[h]/N) s=sample(1:Nh3[h],nh3[h]) points(x1[z3==h][s],x2[z3==h][s],col="red") tauh3[h]<-Nh3[h]*mean(y[z3==h][s]) } tauhat3<-sum(tauh3) #Exercises: #1. Find the best of the three designs above by using #optimal allocation of n among the L strata i.e. use formula page 147: #n_h=n*Nh*sigma_h/sum{k=1_L} Nk*sigmak #2. calculate var_hat_tau_st = var_hat_tauh1+var_hat_tauh2+var_hat_tauh3 # where var_hat_tauh1= sum{over h=1,..,L}(N_h^2*(1-nh/Nh)*sn^2/nh) #where sn^2=1/(nh-1) sum(over i in the sample) (yhi-yh_bar)^2 # analogously for var_hat_tauh2,var_hat_tauh3 #simulation study to compare the three stratification designs #obtain the tau_hat estimate from b samples from each of the stratum b = 10000 tauhat1 = numeric(b) for (i in 1:b){ nh1=Nh1=tauh1=numeric(L) for (h in 1:L) { Nh1[h]<-length(z1[z1==h]) nh1[h]<-round(n*Nh1[h]/N) s<-sample(1:Nh1[h],nh1[h]) tauh1[h]<-Nh1[h]*mean(y[z1==h][s]) } tauhat1[i]<-sum(tauh1) } #do the same for the second design tauhat2<-numeric(b) for (i in 1:b){ nh2=Nh2=tauh2=numeric(L) for (h in 1:L) { Nh2[h]<-length(z2[z2==h]) nh2[h]<-round(n*Nh2[h]/N) s<-sample(1:Nh2[h],nh2[h]) tauh2[h]<-Nh2[h]*mean(y[z2==h][s]) } tauhat2[i]<-sum(tauh2) } #and the same with the third design tauhat3<-rep(NA,b) for (i in 1:b) { nh3=Nh3=tauh3=numeric(b) for (h in 1:L) { Nh3[h]<-length(z3[z3==h]) nh3[h]<-round(n*Nh3[h]/N) s<-sample(1:Nh3[h],nh3[h]) tauh3[h]<-Nh3[h]*mean(y[z3==h][s]) } tauhat3[i]<-sum(tauh3) } par(mfrow=c(2,2)) l1<-min(c(tauhat1,tauhat2,tauhat3)) l2<-max(c(tauhat1,tauhat2,tauhat3)) hist(tauhat1,xlim=c(l1,l2),col=rgb(1,1,0,0.5),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) hist(tauhat2,xlim=c(l1,l2),col=rgb(1,0,0,0.4),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) hist(tauhat3,xlim=c(l1,l2),col=rgb(0,0,1,0.2),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) par(mfrow=c(1,1)) hist(tauhat1,col=rgb(1,1,0,0.5),prob=T,nclass=50,main='Histogram of tauhat1,tauhat2,tauhat3',xlab='tauhat') hist(tauhat3,add=T,col=rgb(1,0,0,0.4),prob=T,nclass=50,xlab='') hist(tauhat2,add=T,col=rgb(0,0,1,0.2),prob=T,nclass=50,xlab='') abline(v=tau,col="red",lwd=2) mean(tauhat1) mean(tauhat2) mean(tauhat3) tau #take a look into the variances of the three estimators var(tauhat1) var(tauhat2) var(tauhat3) ################################################################################## #Finite Population Bootstrap - use it do decide which sampling design to choose ################################################################################## #FPB generates an artificial population, the so-called bootstrap population #by resampling values from each of the stratums in the original sample. #So we are creating a new bootstrap population that is arguably realistic #but does not need to be generated in reality when we only have the sample. #the bootstrap population is generated to be as large as the realistic one and have values #that are as realisitc as possible. #Bootstrap technique is used only for #the simulation purposes to generate population #and learn about sampling distribution of the estimator. #Finite Population Bootstrap works well with SRS and stratified sampling #it will get more complicated with more complex sampling designs. #We will use the dta from the third design stratum<-z1 n1<-length(y[stratum==1]) n2<-length(y[stratum==2]) n3<-length(y[stratum==3]) tapply(y,stratum,mean) tapply(y,stratum,var) ######construct a population with strata sizes: c(n1,n2,n3) N1<-57 N2<-122 N3<-50 repeatStrata1<-floor(N1/n1) repeatStrata2<-floor(N2/n2) repeatStrata3<-floor(N3/n3) res1<-N1-repeatStrata1*n1 res2<-N2-repeatStrata2*n2 res3<-N3-repeatStrata3*n3 #augment the population by creating a 'bootstrap' population y1<-y[stratum==1] y2<-y[stratum==2] y3<-y[stratum==3] y1aug<-c(rep(y1,repeatStrata1),sample(y1,res1)) y2aug<-c(rep(y2,repeatStrata2),sample(y2,res2)) y3aug<-c(rep(y3,repeatStrata3),sample(y3,res3)) #true population tau of the augmented population tau<-sum(y1aug)+sum(y2aug)+sum(y3aug) tauhat<-numeric(b) for (k in 1:b) { s1<-sample(N1,n1) tauhat1[k]<-N1*mean(y1aug[s1]) s2<-sample(N2,n2) tauhat2[k]<-N2*mean(y2aug[s2]) s3<-sample(N3,n3) tauhat3[k]<-N3*mean(y3aug[s3]) tauhat[k]<-tauhat1[k]+tauhat2[k]+tauhat3[k] } par(mfrow=c(1,1)) hist(tauhat,nclass=30, col=rgb(1,1,0,0.2),prob=T) abline(v=tau,col="red",lwd=2) #Bias, variance and MSE of tau_hat (bias=mean(tauhat)-tau) (var_tau_hat=var(tauhat)) var(tauhat1)+var(tauhat2)+var(tauhat3) (MSE_tau_hat=var_tau_hat+bias^2)