#back to the example that we have in last simulation study:
#Example 1.
#consider the data in form of a map of a region tht is divided into farms of different arreas
#We are going to estimate the number of trees in the region by selecting n=5 farms and 
#estimating the number of trees in them. The area of the farm land will be used as an 
#auxiliary variable to estimate the unequal probabilities

z_pop<-c(9,7,1,16,7,5,7,0,7,6,18,5,3,9)#unknown population values
y=z_pop
tau<-sum(y)#true value of the parameter we want to estimate
x<-c(.08,.08,.07,.15,.09,.03,.06,.02,.04,.04,.16,.09,.02,.07)#known auxiliary variable such as
#area of the farm
p<-x/sum(x)#obtain unequal probabilities
n<-10#sample size
N<-length(y)#population size




SRS<-function(y,n,b){
  N<-length(y)
  ybar<-rep(NA,b)
  varhat<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n)
    ybar[i]<-mean(y[s])
    varhat[i]<-((N-n)/N)*var(y[s])/n
  }
  out<-list(ybar,varhat)
  return(out)
}


b<-10000
srs<-SRS(y,10,b)
hist(srs[[1]],prob=TRUE, main='Histogram of the sample mean')
points(mean(y),0,pch=17,col='red')


mean(srs[[1]])
#true mean
(mu_y=mean(y))




#Obtain population ratio R = sum(y)/sum(x)=tau_y/tau_x
#r_s=mean(y_s)/mean(x_s) eq 7.1 
R_pop=sum(y)/sum(x)
#obtain sample ratio r eq. 7.2 page 92
s=sample(1:length(y),10)
y_s=y[s]
x_s=x[s]

r_s=mean(y[s])/mean(x[s])

#take a look if r is an unbiased estimator of R
#unbiased?
R_pop
r_s


#the ratio estimate of the population mean is:  mu_r_hat=r*mu_x
(mu_hat_r=r_s*mean(x))
mean(y)


#Now lets obtain ratio estimator od mu_x
mu_hat_r<-function(y,x,n,b){
              N<-length(y)
              muhatr<-rep(NA,b)
              varhatmuhatr<-rep(NA,b)
              for (i in 1:b){
                s<-sample(1:N,n)
                y_s=y[s]
                x_s=x[s]
                r_s=mean(y_s)/mean(x_s)
                muhatr[i]<-r_s*mean(x)  #eq 7.3 and 7.2
                varhatmuhatr[i]<-((N-n)/N)*(1/((n-1)*n))*sum((y_s-r_s*x_s)^2) #eq 7.5 and 7.6
              }
              out<-list(muhatr,varhatmuhatr)
  return(out)
}



b<-10
mu_hat_r_vec<-mu_hat_r(y,x,10,b)[[1]]
hist(mu_hat_r_vec,prob=TRUE, main='Histogram of the mu_hat_r',nclass=20)
points(mean(y),0,pch=17,col='red')


#lets take a look into the sampling distributions of the
#sample mean and ratio estimator 
b<-10000
run1<-mu_hat_r(y,x,10,b)
mu_hat_r_vec<-run1[[1]]
hist(mu_hat_r_vec,prob=TRUE,xlim=range(srs[[1]]),col=rgb(0,1,1,0.5), main='Histograms of the mu_hat_r and sample mean',nclass=30)
points(mean(y),0,pch=17,col='red')
hist(srs[[1]],col=rgb(1,0,0,0.3),add=T,prob=T,nclass=30)


#by looking at the plots, which one of the estimators has smaller varance?
#by looking at the plots, is there any obvious indication that any of the estimators is biased?

#lets look at the actual numbers:
#1. bias
#bias of the sample mean
mean(srs[[1]])-mean(y)
#bias of the ratio estimate of the population mean
mean(mu_hat_r_vec)-mean(y)
#2. Lets take a look in variances of the both estimators:
#which estimator has smaller variance?
print(paste("Variance of the sample mean estimator:",mean(srs[[2]]),sep=''))
print(paste("Variance of the ratio estimate of mu_x:",mean(run1[[2]]),sep=''))

#3. Calculate MSE for both estimators
#mu_hat_r is not unbiased, so it is useful to study the MSE to compare it to 
#the sample mean
#MSE(mu_hat_r)=var(estimator)+bias(estimator)^2

(MSE_mu_hat_r=var(run1[[1]])+(mean(run1[[1]])-mean(y))^2)
(MSE_sample_mean=var(srs[[1]])+(mean(srs[[1]])-mean(y))^2)
#which estimator has smaller MSE?

#######################################################################
#note: var(run1[[1]]used to obatin MSE(mu_hat_r) is the emirical variance 
#of the mu_hat_r estimator
#this should be approximatelly equal to the var(mu_hat_r) obtained from the
#formula 7.4 page 94
var(run1[[1]])
(var_mu_hat_r=((N-n)/N)*(1/((N-1)*n))*sum((y-R_pop*x)^2))
#######################################################################



#Example 2.
#Model based inference
#we will use the same data for x, but we will simulate the unknown
#popluation values {Y1,...YN} from the model Y ~ N(beta*xi,xi*1)
#i.e. Y=beta*xi+eps_i where eps_i ~ N(0,xi*1)
#We will perform simulation study to obtain sampling distribution 
#of tau_hat 
#tau_hat is model-unbiased estimator of tau under the assumption
#that the relationship between yi and xi is linear and the line
#pass through the origin. So we are going to prove that E(tau_hat)=E(tau)
#Note: tau is now a vector since we re going to sample
#different realizations from the population vector Y=(Y1,..,YN)
#on each iteration of our simulation

#the model that we are simulating from is given on the page 107
#where is fixes elements of the variance of Yi to vi=xi and sigma_R^2
#the general formula for the varance(Yi)=vi*sigma_R^2 is given on a 
#page 106

modelBasedEst=function(Y_sd,beta,x,n,b){
     #simulate population from the model
     hat_tau=tau=err=r=rep(NA,b)
     for (i in (1:b)){  
     Y=beta*x+rnorm(N,0,Y_sd)
     tau[i]=sum(Y)
     s=sample(1:N,n)
     r[i]=(mean(Y[s])/mean(x[s]))
     hat_tau[i]=r[i]*sum(x)      # page 107 equation on tau_hat
     err[i]=((hat_tau[i]-tau[i])^2) #mean squared error
     }
     a<-print.noquote(paste("tau hat:","E(tau_hat)=",mean(hat_tau)))
     out=list(tau=tau,r=r,hat_tau=hat_tau,err=err)
  return(out)
}
        
#to simulate data for the model Y ~ N(beta*xi,xi*1), we are fixing 
#beta=0.8
run1<-modelBasedEst(Y_sd=x,beta=0.8,x=x,n=10,b=10000)

#E(tau_hat)=E(tau) - so tau_hat is model-unbiased estimator of tau
(tau=mean(run1$tau))
(tau_hat=mean(run1$hat_tau))

#MSE is a mean from the vector err obtained as MSE of tau_hat for 
#each of the b samples
(MSE=mean(run1$err))

#lets take a look into a histogram of the tau_hat
hist(run1$hat_tau,prob=TRUE,col=rgb(0,1,1,0.5), xlab='tau_hat',main='Histograms of the tau_hat',nclass=30)
points(tau,0,pch=17,col='red')


#under the model, where the var(Yi)=xi*1, the best linear estimator is
#beta=sum(y[s])/sum(x[s])=r
#lets take a look into the mean of the sampling distribution of r
#obtained from b samples in our simulation and see how close it is
# to the true value of beta, which is parameter of the model 
#where b realizations of the population were simulated from.
(beta=0.8)
mean(run1$r)