#The goal is to estimate the population toal 
#for a simulated population
#let y=x1+x2+some noise is the population 
N<-100 #population size
x1<-runif(100)
x2<-runif(100)
y<-x1+x2+rnorm(100,0,.25)
tau<-sum(y)

#let the sample size be 30
n=30 

#Study the estimate of tau
#using different ways of stratification
#this way we learn which stratification strategy gives the best 
#estimate of tau (smallest variance)
#In principle, estimate of tau will be the most precise if
#the population is stratified such that unitis within each stratum are 
#as similar as possible
#let the number of stratum be L=3


# design 1
#look into the plot of the data
plot(x1,x2)
#divide the region into 3 stratum
#by these two lines:
lines(c(.5,0),c(0,.5))
lines(c(1,.5),c(.5,1))
#create the stratum variable
z1<-length(x1+x2)
z1[(x1+x2)<.5]<-1
z1[(x1+x2)>.5 & (x1+x2)<1.5]<-2
z1[(x1+x2)>1.5]<-3

tapply(y,z1,mean)
table(z1)

L<-3
nh1=Nh1=tauh1= numeric(L)
for (h in 1:L) {
  Nh1[h] = length(z1[z1==h])  
  nh1[h] = round(n*Nh1[h]/N) #page 147 sample size for stratum h, stratums are different in size alo known as proportional allocation
  s=sample(1:Nh1[h],nh1[h])
  points(x1[z1==h][s],x2[z1==h][s],col="red")
  tauh1[h]<-Nh1[h]*mean(y[z1==h][s])
}
tauhat1<-sum(tauh1) #ubiased estimate of tau tau_st_hat=sum(tau_hat_h) page 142 formula

#Lets take a look into
#sample size in each of the stratum
nh1
#pop size in each of the stratum
Nh1

# design 2
#change the boundaries
plot(x1,x2)
lines(c(.5,1),c(0,.5))
lines(c(0,.5),c(.5,1))
L<-3
z2<-length(x1+x2)
z2[(x1-x2)< -.5]<-1
z2[(x1-x2)> -.5 & (x1-x2)<.5]<-2
z2[(x1-x2)>.5]<-3
#let's take a look into the mean of each stratum
tapply(y,z2,mean)
table(z2)

tapply(y,z2,var)
tapply(y,z2,sd)


nh2=Nh2=tauh2=numeric(L)
for (h in 1:L) {
  Nh2[h]<-length(z2[z2==h])
  nh2[h]<-round(n*Nh2[h]/N)
  s<-sample(1:Nh2[h],nh2[h])
  points(x1[z2==h][s],x2[z2==h][s],col="red")
  tauh2[h]<-Nh2[h]*mean(y[z2==h][s])
}
tauhat2<-sum(tauh2)


# design 3
plot(x1,x2)
lines(c(.2,.2),c(0,1))
lines(c(.8,.8),c(0,1))
z3<-length(x1+x2)
z3[x1<.2]<-1
z3[x1>.2 & x1<.8]<-2
z3[x1>.8]<-3
L<-3
nh3=Nh3=tauh3=numeric(L)


for (h in 1:L) {
  Nh3[h] = length(z3[z3==h])
  nh3[h] = round(n*Nh3[h]/N)
  s=sample(1:Nh3[h],nh3[h])
  points(x1[z3==h][s],x2[z3==h][s],col="red")
  tauh3[h]<-Nh3[h]*mean(y[z3==h][s])
}
tauhat3<-sum(tauh3)


#Exercises:
#1. Find the best of the three designs above by using
#optimal allocation of n among the L strata i.e. use formula page 147:
#n_h=n*Nh*sigma_h/sum{k=1_L} Nk*sigmak
#2. calculate var_hat_tau_st = var_hat_tauh1+var_hat_tauh2+var_hat_tauh3
# where var_hat_tauh1= sum{over h=1,..,L}(N_h^2*(1-nh/Nh)*sn^2/nh)
#where sn^2=1/(nh-1) sum(over i in the sample) (yhi-yh_bar)^2
# analogously for var_hat_tauh2,var_hat_tauh3

#simulation study to compare the three stratification designs
#obtain the tau_hat estimate from b samples from each of the stratum
b = 10000
tauhat1 = numeric(b)
for (i in 1:b){
  nh1=Nh1=tauh1=numeric(L)
  for (h in 1:L) {
    Nh1[h]<-length(z1[z1==h])
    nh1[h]<-round(n*Nh1[h]/N)
    s<-sample(1:Nh1[h],nh1[h])
    tauh1[h]<-Nh1[h]*mean(y[z1==h][s])
  }
  tauhat1[i]<-sum(tauh1)
}

#do the same for the second design
tauhat2<-numeric(b)
for (i in 1:b){
  nh2=Nh2=tauh2=numeric(L)
  for (h in 1:L) {
    Nh2[h]<-length(z2[z2==h])
    nh2[h]<-round(n*Nh2[h]/N)
    s<-sample(1:Nh2[h],nh2[h])
    tauh2[h]<-Nh2[h]*mean(y[z2==h][s])
  }
  tauhat2[i]<-sum(tauh2)
}

#and the same with the third design

tauhat3<-rep(NA,b)
for (i in 1:b) {
  nh3=Nh3=tauh3=numeric(b)
  for (h in 1:L) {
    Nh3[h]<-length(z3[z3==h])
    nh3[h]<-round(n*Nh3[h]/N)
    s<-sample(1:Nh3[h],nh3[h])
    tauh3[h]<-Nh3[h]*mean(y[z3==h][s])
  }
  tauhat3[i]<-sum(tauh3)
}


par(mfrow=c(2,2))
l1<-min(c(tauhat1,tauhat2,tauhat3))
l2<-max(c(tauhat1,tauhat2,tauhat3))
hist(tauhat1,xlim=c(l1,l2),col=rgb(1,1,0,0.5),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)
hist(tauhat2,xlim=c(l1,l2),col=rgb(1,0,0,0.4),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)
hist(tauhat3,xlim=c(l1,l2),col=rgb(0,0,1,0.2),prob=T,nclass=50)
abline(v=tau,col="red",lwd=2)


par(mfrow=c(1,1))
hist(tauhat1,col=rgb(1,1,0,0.5),prob=T,nclass=50,main='Histogram of tauhat1,tauhat2,tauhat3',xlab='tauhat')
hist(tauhat3,add=T,col=rgb(1,0,0,0.4),prob=T,nclass=50,xlab='')
hist(tauhat2,add=T,col=rgb(0,0,1,0.2),prob=T,nclass=50,xlab='')
abline(v=tau,col="red",lwd=2)


mean(tauhat1)
mean(tauhat2)
mean(tauhat3)
tau
#take a look into the variances of the three estimators
var(tauhat1)
var(tauhat2)
var(tauhat3)







##################################################################################
#Finite Population Bootstrap - use it do decide which sampling design to choose
##################################################################################
#FPB generates an artificial population, the so-called bootstrap population
#by resampling values from each of the stratums in the original sample.
#So we are creating a new bootstrap population that is arguably realistic
#but does not need to be generated in reality when we only have the sample.
#the bootstrap population is generated to be as large as the realistic one and have values
#that are as realisitc as possible.

#Bootstrap technique is used only for 
#the simulation purposes to generate population
#and learn about sampling distribution of the estimator.
#Finite Population Bootstrap  works well with SRS and stratified sampling
#it will get more complicated with more complex sampling designs.


#We will use the dta from the third design
stratum<-z1

n1<-length(y[stratum==1])
n2<-length(y[stratum==2])
n3<-length(y[stratum==3])
tapply(y,stratum,mean)
tapply(y,stratum,var)
######construct a population with strata sizes:
c(n1,n2,n3)

N1<-57
N2<-122
N3<-50

repeatStrata1<-floor(N1/n1)
repeatStrata2<-floor(N2/n2)
repeatStrata3<-floor(N3/n3)
res1<-N1-repeatStrata1*n1
res2<-N2-repeatStrata2*n2
res3<-N3-repeatStrata3*n3

#augment the population by creating a 'bootstrap' population
y1<-y[stratum==1]
y2<-y[stratum==2]
y3<-y[stratum==3]

y1aug<-c(rep(y1,repeatStrata1),sample(y1,res1))
y2aug<-c(rep(y2,repeatStrata2),sample(y2,res2))
y3aug<-c(rep(y3,repeatStrata3),sample(y3,res3))
#true population tau of the augmented population
tau<-sum(y1aug)+sum(y2aug)+sum(y3aug)

tauhat<-numeric(b)
for (k in 1:b) {
  s1<-sample(N1,n1)
  tauhat1[k]<-N1*mean(y1aug[s1])
  s2<-sample(N2,n2)
  tauhat2[k]<-N2*mean(y2aug[s2])
  s3<-sample(N3,n3)
  tauhat3[k]<-N3*mean(y3aug[s3])
  tauhat[k]<-tauhat1[k]+tauhat2[k]+tauhat3[k]
}
par(mfrow=c(1,1))
hist(tauhat,nclass=30, col=rgb(1,1,0,0.2),prob=T)
abline(v=tau,col="red",lwd=2)

#Bias, variance and MSE of tau_hat
(bias=mean(tauhat)-tau)
(var_tau_hat=var(tauhat))
var(tauhat1)+var(tauhat2)+var(tauhat3)
(MSE_tau_hat=var_tau_hat+bias^2)