#study the estimate of tau using three different types of allocation 
#in stratification designs
#choose the best stratification strategy from the tutorial-8.r

N<-100 #population size
x1<-runif(100)
x2<-runif(100)
y<-x1+x2+rnorm(100,0,.25)
tau<-sum(y)
L=3
#let the sample size be 30
n=30 


# design 1
#look into the plot of the data
plot(x1,x2)
#divide the region into 3 stratum
#by these two lines:
lines(c(.5,0),c(0,.5))
lines(c(1,.5),c(.5,1))
#create the stratum variable
z1<-length(x1+x2)
z1[(x1+x2)<.5]<-1
z1[(x1+x2)>.5 & (x1+x2)<1.5]<-2
z1[(x1+x2)>1.5]<-3

tapply(y,z1,mean)
table(z1)


#constant allocation  #same size stratums
b = 10000
tauhat1 = rep(NA,b)
#nh1=c(rep(10,L))
nh1=c(rep(n/L,L))  # page 146 nh=n/L
for (i in 1:b){
  Nh1=tauh1=rep(NA,L)
  for (h in 1:L) {
    Nh1[h]<-length(z1[z1==h])
    s<-sample(1:Nh1[h],nh1[h])
    tauh1[h]<-Nh1[h]*mean(y[z1==h][s])
  }
  tauhat1[i]<-sum(tauh1)
}




#proportional allocation
b = 10000
tauhat2 = rep(NA,b)
for (i in 1:b){
  nh2=Nh2=tauh2=rep(NA,L)
  for (h in 1:L) {
    Nh2[h]<-length(z1[z1==h])
    nh2[h]<-round(n*Nh2[h]/N)
    s<-sample(1:Nh2[h],nh2[h])
    tauh2[h]<-Nh2[h]*mean(y[z1==h][s])
  }
  tauhat2[i]<-sum(tauh2)
}


#optimal allocation  use formula nh=n*Nh*sigma_h/sum(Nh*sigma_h)
b = 10000
tauhat3 = rep(NA,b)
sigma=tapply(y,z1,var)
Nh3=tapply(y,z1,length)
denom=sum(sigma*Nh3)
for (i in 1:b){
  nh3=tauh3=rep(NA,L)
  for (h in 1:L) {
    nh3[h]<-round(n*Nh3[h]*sigma[h]/denom)
    s<-sample(1:Nh3[h],nh3[h])
    tauh3[h]<-Nh3[h]*mean(y[z1==h][s])
  }
  tauhat3[i]<-sum(tauh3)
}




par(mfrow=c(2,2))
hist(tauhat1,col=rgb(1,1,0,0.5),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)
hist(tauhat2,col=rgb(1,0,0,0.4),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)
hist(tauhat3,col=rgb(0,0,1,0.2),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)

var(tauhat1)
var(tauhat2)
var(tauhat3)


par(mfrow=c(1,1))
hist(tauhat3,col=rgb(1,1,0,0.5),prob=T,nclass=50,main='Histogram of tauhat1,tauhat2,tauhat3',xlab='tauhat')
hist(tauhat2,add=T,col=rgb(1,0,0,0.4),prob=T,nclass=50,xlab='')
hist(tauhat1,add=T,col=rgb(0,0,1,0.2),prob=T,nclass=50,xlab='')
abline(v=tau,col="red",lwd=2)


#systematic sampling.
#we want to sample at a site every 10-th day of the year, 
#that is the number of primary units is N=10.
#M=365 is total number of secondary units
#Primary units are of unequal size since 365/10 is not an integer

M<-364
#systematic sample with one primary unit
load("rainfall-data.gz")
y<-rain #this is the variable of interest
tau=sum(y)#this is the total amount of rainfall for the year, the quantity that we wish to estimate

#1 primary unit
#m=round(M/k) secondary units
n=1
k<-10
start<-sample(1:k,n)
s<-seq(start,M,k)
muhat<-(k/M)*sum(y[s]) #unbiased estimate of mean daily rainfall
m=length(s) #number of secondary units 

#M  is the total number of posible secondary units

#Now, lets fix the number of secondary units (days) at m=50

n=1
m<-50

#systematic sample with one primary unit
k<-floor(M/m)
start<-sample(1:k,n)
s<-seq(start,M,k)

colorVec=rep('darkgray',M)
colorVec[s]='red'
plot(1:M,y, main='Rainfall data over the year,\n for 1:365 days, 1 primary unit',col=colorVec)

tauhat1<-k*sum(y[s])#unbiased estimate of the total rainfall from systematic sample
#with one primary unit

#systematic sampling with 2 primary units
n<-2
k<-floor(M/(m/2))
start<-sample(1:k,n)
s1<-seq(start[1],M,k)
s2<-seq(start[2],M,k)
#s_2PrimUnit=c(s1,s2)

colorVec=rep('darkgray',M)
colorVec[c(s1,s2)]='red'
plot(1:M,y, main='Rainfall data over the year,\n for 1:365 days, 2 primary units',col=colorVec)

#y[s_2PrimUnit] 
N=k
tauhat2<-k*(mean(c(sum(y[s1]),sum(y[s2])))) #unbiased estimate of the total rainfall 
#from systematic sample with
#two primary units


#check the lengths of the samples drawn using systematic sampling
length(s)
length(c(s1,s2))



# Cluster sampling:
# The population is divided into N groups i.e., clusters.
# we randomly select n clusters out of N to include in the sample.
# The number of observations within each cluster Mi is known and also
# M = M1 + M2 + M3 + ... + MN-1 + MN.
# All of the elements within selected clusters are included in the sample.

#cluster sampling with n clusters of size Mi (here we choose n and Mi such that 
#n*Mi=50, the number of secondary sampling units)

M=364
n=50
Mi=2
z=NULL
N <-M/Mi #number of clusters in the population

for (i in 1:N) z<-c(z,rep(i,Mi))
#draw n clusters (primary units)
start<-sample(1:N,n)

ys<-rep(NA,n)
for (i in 1:n){
  yi<-y[which(z==start[i])]
  ys[i]<-sum(yi)
}

tauhat_cluster<-N*mean(ys)
tau=sum(y)