#Tutorial-9
#we estimate the variance of the estimate of the population total when using a systematic
#sample with one primary unit;
#the data comprise counts of a certain species of ducks observed in N=200 quadrats
#want to estimate the total count using a systematic sample of size n=20
#then we calculate estimates of the variance of this estimate using the following methods
#1. simulation
#2. treating the systematic sample as a simple random sample
#3. post stratify the population and calculate the stratified variance

pop<-read.table("bwt100.dat")
pop<-data.matrix(pop)
tau<-sum(pop)
N<-length(c(pop))
n<-20
k<-N/n
start<-sample(1:k,1)
sys<-seq(start,N,k)
tauhat<-k*sum(pop[start,])

#we can actually calculate the exact variance since there are only 10 possible systematic
#samples and the exact sampling dist of tauhat can be obtained
tauhat<-c()
for (i in 1:k) {
  start<-i
  sys<-seq(start,N,k)
  tauhat[i]<-k*sum(pop[start,])
}
table(tauhat)
vartrue<-var(tauhat)
sdtrue<-sqrt(vartrue)


#estimate the variance with simulation
b<-10000
tauhat<-c()
for (i in 1:b) {
  start<-sample(1:k,1)
  sys<-seq(start,N,k)
  tauhat[i]<-k*sum(pop[start,])
}
varsim<-var(tauhat)
sdsim<-sqrt(varsim)

#treat the sys sample as an SRS
start<-sample(1:k,1)
sys<-seq(start,N,k)
tauhat<-k*sum(pop[start])
varsrs<-(N-n)*var(pop[start,])/n
varsrs

#post-stratification: 1. define each 2-column of the matrix pop as a stratum; calculate 
#both the stratification and post-stratification variance
H<-10
Nh<-rep(20,H)
nh<-rep(2,H)
start<-sample(1:k,1)
sys<-seq(start,N,k)
s2h<-c()
for (h in 1:H) {
  samp<-sys[(2*h-1):(2*h)]
  s2h[h]<-var(pop[samp])
}
varst<-sum(Nh*(Nh-nh)*s2h/nh)
varst

#practice: 1. try this with strata of different sizes, 2. try this on the rainfall data

#two-stage sampling for the duck data; let the primary units be the columns of the data 
#matrix pop, i.e., N=20. There are 10 secondary units in each primary unit, i.e., Mi=10.
#We choose n primary units and take m secondary units inside each. 
#Let the total number of secondary units be fixed at n*m=20. We will try n=5,m=4; n=10,m=2;
#n=20,m=1.

N=20
M<-10
n=5
m=4
ps<-sample(1:20,n)
yhat<-c()
for (h in 1:n) {
  ss<-sample(1:M,m)
  ys<-pop[ss,ps[h]]
  yhat[h]<-M*sum(ys)/m
}
tauhat<-N*sum(yhat)/n

twostage<-function(n,m){
  ps<-sample(1:20,n)
  yhat<-c()
  for (h in 1:n) {
    ss<-sample(1:M,m)
    ys<-pop[ss,ps[h]]
    yhat[h]<-M*sum(ys)/m
  }
  tauhat<-N*sum(yhat)/n
  return(tauhat)
}


b<-10000
tauhat1<-c()
tauhat2<-c()
tauhat3<-c()
for (i in 1:b) {
  tauhat1[i]<-twostage(5,4)
  tauhat2[i]<-twostage(10,2)
  tauhat3[i]<-twostage(20,1)
}

box<-list(tauhat1,tauhat2,tauhat3)
boxplot(box)