############################################################################
#cluster sampling design: obtain HH(PPS), HT, ratio and expansion estimator
############################################################################
#Start with an example from the class:
#households are primary units, people are secondary units
#N=size of primary units
#M=20 secondary units (people) 


#one sample, 2 primary units
Mi=c(4,2)
ys=c(8,5)
N=10
M=20
n=2


#1. PPS or HH estimator
pi=Mi/M   #probabilities of inclusion
tau_p=1/n*sum(ys/pi) 

#2. Horwitz-Thomson Estimator
pii=1-(1-pi)^n
tauHT=sum(ys/pii) 

#3. ratio estimator
tau_r=M*sum(ys)/sum(Mi)
#4. expansion estimator
tau_hat=N*mean(ys)

c(tau_p,tauHT,tau_r,tau_hat)

####################################
#consider again example with the households
#Lets say now we have the whole population of the secondary units (people)
#The total of primery units is N=3,the total of secondary units is M=20
####################################
N=3
y=c(30,20,40,50,80,20,40,50,30,20,10,30,6,25,24,34,54,34,21,23) #grocery shoppng expenditure per each of the M secondary units
Mi=c(7,7,6)
M=20
pi=Mi/M
pii=1-(1-pi)^n
tau=sum(y)

#obtain HH nd HT estimates of tau using systematic sampling 
#primiary units are sampled from sampling with replacement and 
#unequal probabilities
#(from just one sample here)
n=2

#select n primary units
startHH<-sample(1:N,n,replace=T,prob=pi)
#get all the secondary units that correspond to the first primary unit
s1<-seq(startHH[1],M,N)
#sum the yij in the first primary unit
ys=sum(y[s1])
#get all the secondary units that correspond to the second primary unit
s2<-seq(startHH[2],M,N)
#sum the yij in the second primary unit
ys=c(ys,sum(y[s2]))
(tauhatHH <-mean(ys/(pi[startHH]) ))  #tau_hat_p=1/n*sum(yi/pi) page 161


startHT<-sample(1:N,n,replace=T,prob=pii)
startHT=unique(startHT)

ys=NULL
s1<-seq(startHT[1],M,N)
ys=sum(y[s1])

if (length(startHT)>1){
s2<-seq(startHT[2],M,N)
ys=c(ys,sum(y[s2]))
}

(tauhatHT <-sum((ys/(pii[startHT]) ))) #tau_hat_HT=sum(Yi/pii) for i=1,..,nu 
                                       #where nu is the number of distinct 
                                       #primary units in the sample (page 162)



#Now we are going to study the sampling distribution of HH and HT
#using cluster sampling

HHCluster<-function(n,Mi,N,M,y){
  
  z<-pi<-c()
  for (i in 1:N) z<-c(z,rep(i,Mi[i]))
  pi=Mi/M
  
  start<-sample(1:N,n,replace=T,prob=pi)
  
  ys<-rep(NA,n)
  for (i in 1:n){
    s=which(z==start[i])
    ys[i]=sum(y[s])
  }
  tauhatHH<-mean(ys/pi[start]) 
  return(tauhatHH)
}

HTCluster<-function(n,Mi,N,M,y){
  
  z<-pi<-c()
  for (i in 1:N) z<-c(z,rep(i,Mi[i]))
  pi=Mi/M
  pii=1-(1-pi)^n
  
  start<-sample(1:N,n,replace=T,prob=pii)
  start=unique(start)
  
  n=length(start)
  ys<-rep(NA,n)
  for (i in 1:n){
    s=which(z==start[i])
    ys[i]=sum(y[s])
  }
  tauhatHT<-sum(ys/pii[start]) 
  return(tauhatHT)
}
b=10000
tauhHH<-replicate(b,HHCluster(n=2,Mi=Mi,M=M,N=N,y=y))
tauhHT<-replicate(b,HTCluster(n=2,Mi=Mi,M=M,N=N,y=y))
mean(tauhHH)
mean(tauhHT)
boxplot(list(HT=tauhHT,PPS=tauhHH))
abline(h=tau,col='red')
#take a look at histograms, we no longer have many different 
#values as if we had SRS; instead, the primary units are 
#drawn with unequal probabilities, so some of the units are more favoured 
hist(tauhHH)
hist(tauhHT,add=T,col=rgb(1,0,0,0.5),nclass=150)


###################################################################
#multistage sampling example
###################################################################
#two-stage sampling: duck data; 
#The primary units are the columns of the data N=20 
#There are 10 secondary units in each primary unit, i.e., Mi=10. (10 rows)
#Choose n primary units, sample m secondary units within each primary unit. 


y_pop<-read.table("bwt100.dat")
y_pop<-data.matrix(y_pop)
tau<-sum(y_pop)

dim(y_pop)

N=20
Mi<-10
n=5
m=4
pu<-sample(1:N,n)
yhat<-rep(NA,n)
for (i in 1:n) {
  su<-sample(1:Mi,m)
  ys<-y_pop[su,pu[i]]
  yhat[i]<-Mi*sum(ys)/m  #equation 13.1 page 173   yi_hat=(Mi/mi)*sum(yij)=Mi*yi_bar
}
tauhat<-N*sum(yhat)/n  #equation 13.2 page 173    tau_hat=(N/n)*sum(yi_hat)


twostage<-function(n,m,y_pop){
  pu<-sample(1:20,n)
  yhat<-rep(NA,n)
  for (i in 1:n) {
    su<-sample(1:Mi,m)
    ys<-y_pop[su,pu[i]]
    yhat[i]<-Mi*sum(ys)/m
  }
  tauhat<-N*sum(yhat)/n
  return(tauhat)
}

#Let the total number of secondary units be fixed at n*m=20. 
#study propoerties of the estimators by changing n and m
#diffferent n and m

b<-10000

tauhat1<-replicate(b,twostage(5,4,y_pop))
tauhat2<-replicate(b,twostage(10,2,y_pop))
tauhat3<-replicate(b,twostage(20,1,y_pop))


#a box plot of all the three estimators
box_ls<-list(tauhat1=tauhat1,tauhat2=tauhat2,tauhat3=tauhat3)
boxplot(box_ls)

#make overlapping histograms
hist(tauhat1, main='Histogram of tauhat1, tauhat2, tauhat3',xlab='')
hist(tauhat2,add=T,col=rgb(1,0,0,0.5),xlab='')
hist(tauhat3,add=T,col=rgb(1,0,1,0.5),xlab='')

#look at the actual means of each of the estimators
mean(tauhat1);mean(tauhat2);mean(tauhat3)
#look at the empiricalvariances of each of the estimators
var(tauhat1);var(tauhat2);var(tauhat3)