#Example 1:

#consider the data in form of a map of a region tht is divided into farms of different arreas
#We are going to estimate the number of trees in the region by selecting n=5 farms and 
#estimating the number of trees in them. The area of the farm land will be used as an 
#auxiliary variable to estimate the unequal probabilities


z_pop<-c(9,7,1,16,7,5,7,0,7,6,18,5,3,9)#unknown population values
tau<-sum(z_pop)#true value of the parameter we want to estimate
x<-c(.08,.08,.07,.15,.09,.03,.06,.02,.04,.04,.16,.09,.02,.07)#known auxiliary variable such as
#area of the farm
p<-x/sum(x)#obtain unequal probabilities
n<-5#sample size
N<-length(z_pop)#population size

#lets see what will be the sample total popoulation estimate
#if we ignore the size of the farms
#and take SRS
#sample mean to estimate the population total

(s<-sample(1:N,n))
smpl<-z_pop[s]
(tau_hat<-N*mean(smpl))
varhat<-N*(N-n)*var(smpl)/n   #using formula 2.9 remember tau_hat=N*y_bar

#now take into account the size of the farms, i.e, take a sample using the unequal 
#probabilities and calculate the Hansen-Hurvitz estimator

p<-x/sum(x)
(smpl<-sample(1:N,n,replace=TRUE,prob=p))
y_s<-z_pop[smpl]
p_s<-p[smpl]
(tauhat.hh<-mean(y_s/p_s))
varhat.hh<-var(y_s/p_s)/n

#let's calculate the Hurvitz-Thompson estimator
#obtain the inclusion probabilities
pi<-1-(1-p)^n
unique_s<-unique(smpl)
y_s<-z_pop[unique_s]
nu<-length(unique_s)
pi_s<-pi[unique_s]

tauhat.ht<-sum(y_s/pi_s)


#Now obtain variance of the tauhat.ht

#First obtain the joint inclusion prob
#pii=pi on diagonal
#pij=pi+pj-(1-(1-pi-pj)^n), off-dagonal

t<-array(dim=c(N,N))
pij=matrix(NA,N,N)
for (i in 1:N) {
  for (j in 1:N) {
    if (i==j) pij[i,j]<-pi[i]
    if (i!=j) pij[i,j]<-pi[i]+pi[j]-(1-(1-p[i]-p[j])^n)
  }
}

#calculate the variance of HT estimator using formula (6.5) page 70
M<-pij
term1<-sum(((1-pi)/pi)*z_pop^2)
k<-0
term2<-c()
for (i in 1:N) {
  for (j in 1:N) {
    if (j!=i) {
      k<-k+1
      term2[k]<-((M[i,j]-M[i,i]*M[j,j])/(M[i,i]*M[j,j]))*z_pop[i]*z_pop[j]
    }
  }
}
var.ht<-term1+sum(term2)


#next, calculate the estimate of variance of HT estimator using formula (6.6) page 70
(G<-pij[unique_s,unique_s])
term1<-sum(((1-pi_s)/pi_s^2)*y_s^2)
k<-0
term2<-c()
for (i in 1:nu) {
  for (j in 1:nu) {
    if (j!=i) {
      k<-k+1
      term2[k]<-((G[i,j]-G[i,i]*G[j,j])/(G[i,i]*G[j,j]))*y_s[i]*y_s[j]/G[i,j]
    }
  }
}
varhat.ht<-term1+sum(term2)

#calculate the approximation to the estimate of the variance of HT estimator 
#using formula (6.8) page 70
t<-nu*y_s/pi_s
tHT=mean(t)
vartild.ht<-((N-nu)/N)*var(t)/nu



#Now we can run some simulations
HH<-function(y,n,b,p){
  N<-length(y)
  tauhh<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n,replace=TRUE,prob=p)
    tauhh[i]<-mean(y[s]/p[s])
  }
  return(tauhh)
}
b<-10000
thh<-HH(z_pop,n,b,p)
hist(thh)
points(tau,0,pch=17, col='red')

#Horwitz - Thompson
HT<-function(y,n,b,p){
  N         = length(y)
  pi        = 1-(1-p)^n
  tauht     = rep(NA,b)
  var_tauht = rep(NA,b)
  for (i in 1:b){
    smpl         = sample(1:N,n,replace=TRUE,prob=pi)
    s            = unique(smpl)
    nu           = length(s)
    #approximate tauHT estimator page 70
    t            = nu*y[s]/pi[s]
    tauht[i]     = mean(t)
    var_tauht[i] = ((N-nu)/N)*var(t)/nu #variance of the tauHT formula (6.8) page 70
  }
  return(list(tauht=tauht,var_tauht=var_tauht))
}

tht<-HT(z_pop,n,b,p)
#mean(tht$tauht)
hist(tht$tauht)
points(tau,0,pch=17,col='red')




#Next function calculates biased HH. What is making this HH estimator biased?

HHbiased<-function(y,n,b,p){
  N<-length(y)
  tau_hat<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n,replace=TRUE,prob=p)
    s<-unique(s)
    tau_hat[i]<-mean(y[s]/p[s])
  }
  out<-tau_hat
  return(out)
}

hhb<-HHbiased(z_pop,5,b,p)
hist(hhb,col="cyan",prob=TRUE)
points(tau,0,pch=17,col='red')