############################################################################ #cluster sampling design: obtain HH(PPS), HT, ratio and expansion estimator ############################################################################ #Start with an example from the class: #households are primary units, people are secondary units #N=size of primary units #M=20 secondary units (people) #one sample, 2 primary units Mi=c(4,2) ys=c(8,5) N=10 M=20 n=2 #1. PPS or HH estimator pi=Mi/M #probabilities of inclusion tau_p=1/n*sum(ys/pi) #2. Horwitz-Thomson Estimator pii=1-(1-pi)^n tauHT=sum(ys/pii) #3. ratio estimator tau_r=M*sum(ys)/sum(Mi) #4. expansion estimator tau_hat=N*mean(ys) c(tau_p,tauHT,tau_r,tau_hat) #################################### #consider again example with the households #Lets say now we have the whole population of the secondary units (people) #The total of primery units is N=3,the total of secondary units is M=20 #################################### N=3 y=c(30,20,40,50,80,20,40,50,30,20,10,30,6,25,24,34,54,34,21,23) #grocery shoppng expenditure per each of the M secondary units Mi=c(7,7,6) M=20 pi=Mi/M pii=1-(1-pi)^n tau=sum(y) #obtain HH nd HT estimates of tau using systematic sampling #primiary units are sampled from sampling with replacement and #unequal probabilities #(from just one sample here) n=2 #select n primary units startHH<-sample(1:N,n,replace=T,prob=pi) #get all the secondary units that correspond to the first primary unit s1<-seq(startHH[1],M,N) #sum the yij in the first primary unit ys=sum(y[s1]) #get all the secondary units that correspond to the second primary unit s2<-seq(startHH[2],M,N) #sum the yij in the second primary unit ys=c(ys,sum(y[s2])) (tauhatHH <-mean(ys/(pi[startHH]) )) #tau_hat_p=1/n*sum(yi/pi) page 161 startHT<-sample(1:N,n,replace=T,prob=pii) startHT=unique(startHT) ys=NULL s1<-seq(startHT[1],M,N) ys=sum(y[s1]) if (length(startHT)>1){ s2<-seq(startHT[2],M,N) ys=c(ys,sum(y[s2])) } (tauhatHT <-sum((ys/(pii[startHT]) ))) #tau_hat_HT=sum(Yi/pii) for i=1,..,nu #where nu is the number of distinct #primary units in the sample (page 162) #Now we are going to study the sampling distribution of HH and HT #using cluster sampling HHCluster<-function(n,Mi,N,M,y){ z<-pi<-c() for (i in 1:N) z<-c(z,rep(i,Mi[i])) pi=Mi/M start<-sample(1:N,n,replace=T,prob=pi) ys<-rep(NA,n) for (i in 1:n){ s=which(z==start[i]) ys[i]=sum(y[s]) } tauhatHH<-mean(ys/pi[start]) return(tauhatHH) } HTCluster<-function(n,Mi,N,M,y){ z<-pi<-c() for (i in 1:N) z<-c(z,rep(i,Mi[i])) pi=Mi/M pii=1-(1-pi)^n start<-sample(1:N,n,replace=T,prob=pii) start=unique(start) n=length(start) ys<-rep(NA,n) for (i in 1:n){ s=which(z==start[i]) ys[i]=sum(y[s]) } tauhatHT<-sum(ys/pii[start]) return(tauhatHT) } b=10000 tauhHH<-replicate(b,HHCluster(n=2,Mi=Mi,M=M,N=N,y=y)) tauhHT<-replicate(b,HTCluster(n=2,Mi=Mi,M=M,N=N,y=y)) mean(tauhHH) mean(tauhHT) boxplot(list(HT=tauhHT,PPS=tauhHH)) abline(h=tau,col='red') #take a look at histograms, we no longer have many different #values as if we had SRS; instead, the primary units are #drawn with unequal probabilities, so some of the units are more favoured hist(tauhHH) hist(tauhHT,add=T,col=rgb(1,0,0,0.5),nclass=150) ################################################################### #multistage sampling example ################################################################### #two-stage sampling: duck data; #The primary units are the columns of the data N=20 #There are 10 secondary units in each primary unit, i.e., Mi=10. (10 rows) #Choose n primary units, sample m secondary units within each primary unit. y_pop<-read.table("bwt100.dat") y_pop<-data.matrix(y_pop) tau<-sum(y_pop) dim(y_pop) N=20 Mi<-10 n=5 m=4 pu<-sample(1:N,n) yhat<-rep(NA,n) for (i in 1:n) { su<-sample(1:Mi,m) ys<-y_pop[su,pu[i]] yhat[i]<-Mi*sum(ys)/m #equation 13.1 page 173 yi_hat=(Mi/mi)*sum(yij)=Mi*yi_bar } tauhat<-N*sum(yhat)/n #equation 13.2 page 173 tau_hat=(N/n)*sum(yi_hat) twostage<-function(n,m,y_pop){ pu<-sample(1:20,n) yhat<-rep(NA,n) for (i in 1:n) { su<-sample(1:Mi,m) ys<-y_pop[su,pu[i]] yhat[i]<-Mi*sum(ys)/m } tauhat<-N*sum(yhat)/n return(tauhat) } #Let the total number of secondary units be fixed at n*m=20. #study propoerties of the estimators by changing n and m #diffferent n and m b<-10000 tauhat1<-replicate(b,twostage(5,4,y_pop)) tauhat2<-replicate(b,twostage(10,2,y_pop)) tauhat3<-replicate(b,twostage(20,1,y_pop)) #a box plot of all the three estimators box_ls<-list(tauhat1=tauhat1,tauhat2=tauhat2,tauhat3=tauhat3) boxplot(box_ls) #make overlapping histograms hist(tauhat1, main='Histogram of tauhat1, tauhat2, tauhat3',xlab='') hist(tauhat2,add=T,col=rgb(1,0,0,0.5),xlab='') hist(tauhat3,add=T,col=rgb(1,0,1,0.5),xlab='') #look at the actual means of each of the estimators mean(tauhat1);mean(tauhat2);mean(tauhat3) #look at the empiricalvariances of each of the estimators var(tauhat1);var(tauhat2);var(tauhat3)