#study the estimate of tau using three different types of allocation #in stratification designs #choose the best stratification strategy from the tutorial-8.r N<-100 #population size x1<-runif(100) x2<-runif(100) y<-x1+x2+rnorm(100,0,.25) tau<-sum(y) L=3 #let the sample size be 30 n=30 # design 1 #look into the plot of the data plot(x1,x2) #divide the region into 3 stratum #by these two lines: lines(c(.5,0),c(0,.5)) lines(c(1,.5),c(.5,1)) #create the stratum variable z1<-length(x1+x2) z1[(x1+x2)<.5]<-1 z1[(x1+x2)>.5 & (x1+x2)<1.5]<-2 z1[(x1+x2)>1.5]<-3 tapply(y,z1,mean) table(z1) #constant allocation #same size stratums b = 10000 tauhat1 = rep(NA,b) #nh1=c(rep(10,L)) nh1=c(rep(n/L,L)) # page 146 nh=n/L for (i in 1:b){ Nh1=tauh1=rep(NA,L) for (h in 1:L) { Nh1[h]<-length(z1[z1==h]) s<-sample(1:Nh1[h],nh1[h]) tauh1[h]<-Nh1[h]*mean(y[z1==h][s]) } tauhat1[i]<-sum(tauh1) } #proportional allocation b = 10000 tauhat2 = rep(NA,b) for (i in 1:b){ nh2=Nh2=tauh2=rep(NA,L) for (h in 1:L) { Nh2[h]<-length(z1[z1==h]) nh2[h]<-round(n*Nh2[h]/N) s<-sample(1:Nh2[h],nh2[h]) tauh2[h]<-Nh2[h]*mean(y[z1==h][s]) } tauhat2[i]<-sum(tauh2) } #optimal allocation use formula nh=n*Nh*sigma_h/sum(Nh*sigma_h) b = 10000 tauhat3 = rep(NA,b) sigma=tapply(y,z1,var) Nh3=tapply(y,z1,length) denom=sum(sigma*Nh3) for (i in 1:b){ nh3=tauh3=rep(NA,L) for (h in 1:L) { nh3[h]<-round(n*Nh3[h]*sigma[h]/denom) s<-sample(1:Nh3[h],nh3[h]) tauh3[h]<-Nh3[h]*mean(y[z1==h][s]) } tauhat3[i]<-sum(tauh3) } par(mfrow=c(2,2)) hist(tauhat1,col=rgb(1,1,0,0.5),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) hist(tauhat2,col=rgb(1,0,0,0.4),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) hist(tauhat3,col=rgb(0,0,1,0.2),prob=T,nclass=50) abline(v=tau,col="red",lwd=2) var(tauhat1) var(tauhat2) var(tauhat3) par(mfrow=c(1,1)) hist(tauhat3,col=rgb(1,1,0,0.5),prob=T,nclass=50,main='Histogram of tauhat1,tauhat2,tauhat3',xlab='tauhat') hist(tauhat2,add=T,col=rgb(1,0,0,0.4),prob=T,nclass=50,xlab='') hist(tauhat1,add=T,col=rgb(0,0,1,0.2),prob=T,nclass=50,xlab='') abline(v=tau,col="red",lwd=2) #systematic sampling. #we want to sample at a site every 10-th day of the year, #that is the number of primary units is N=10. #M=365 is total number of secondary units #Primary units are of unequal size since 365/10 is not an integer M<-364 #systematic sample with one primary unit load("rainfall-data.gz") y<-rain #this is the variable of interest tau=sum(y)#this is the total amount of rainfall for the year, the quantity that we wish to estimate #1 primary unit #m=round(M/k) secondary units n=1 k<-10 start<-sample(1:k,n) s<-seq(start,M,k) muhat<-(k/M)*sum(y[s]) #unbiased estimate of mean daily rainfall m=length(s) #number of secondary units #M is the total number of posible secondary units #Now, lets fix the number of secondary units (days) at m=50 n=1 m<-50 #systematic sample with one primary unit k<-floor(M/m) start<-sample(1:k,n) s<-seq(start,M,k) colorVec=rep('darkgray',M) colorVec[s]='red' plot(1:M,y, main='Rainfall data over the year,\n for 1:365 days, 1 primary unit',col=colorVec) tauhat1<-k*sum(y[s])#unbiased estimate of the total rainfall from systematic sample #with one primary unit #systematic sampling with 2 primary units n<-2 k<-floor(M/(m/2)) start<-sample(1:k,n) s1<-seq(start[1],M,k) s2<-seq(start[2],M,k) #s_2PrimUnit=c(s1,s2) colorVec=rep('darkgray',M) colorVec[c(s1,s2)]='red' plot(1:M,y, main='Rainfall data over the year,\n for 1:365 days, 2 primary units',col=colorVec) #y[s_2PrimUnit] N=k tauhat2<-k*(mean(c(sum(y[s1]),sum(y[s2])))) #unbiased estimate of the total rainfall #from systematic sample with #two primary units #check the lengths of the samples drawn using systematic sampling length(s) length(c(s1,s2)) # Cluster sampling: # The population is divided into N groups i.e., clusters. # we randomly select n clusters out of N to include in the sample. # The number of observations within each cluster Mi is known and also # M = M1 + M2 + M3 + ... + MN-1 + MN. # All of the elements within selected clusters are included in the sample. #cluster sampling with n clusters of size Mi (here we choose n and Mi such that #n*Mi=50, the number of secondary sampling units) M=364 n=50 Mi=2 z=NULL N <-M/Mi #number of clusters in the population for (i in 1:N) z<-c(z,rep(i,Mi)) #draw n clusters (primary units) start<-sample(1:N,n) ys<-rep(NA,n) for (i in 1:n){ yi<-y[which(z==start[i])] ys[i]<-sum(yi) } tauhat_cluster<-N*mean(ys) tau=sum(y)