#back to the example that we have in last simulation study: #Example 1. #consider the data in form of a map of a region tht is divided into farms of different arreas #We are going to estimate the number of trees in the region by selecting n=5 farms and #estimating the number of trees in them. The area of the farm land will be used as an #auxiliary variable to estimate the unequal probabilities z_pop<-c(9,7,1,16,7,5,7,0,7,6,18,5,3,9)#unknown population values y=z_pop tau<-sum(y)#true value of the parameter we want to estimate x<-c(.08,.08,.07,.15,.09,.03,.06,.02,.04,.04,.16,.09,.02,.07)#known auxiliary variable such as #area of the farm p<-x/sum(x)#obtain unequal probabilities n<-10#sample size N<-length(y)#population size SRS<-function(y,n,b){ N<-length(y) ybar<-rep(NA,b) varhat<-rep(NA,b) for (i in 1:b){ s<-sample(1:N,n) ybar[i]<-mean(y[s]) varhat[i]<-((N-n)/N)*var(y[s])/n } out<-list(ybar,varhat) return(out) } b<-10000 srs<-SRS(y,10,b) hist(srs[[1]],prob=TRUE, main='Histogram of the sample mean') points(mean(y),0,pch=17,col='red') mean(srs[[1]]) #true mean (mu_y=mean(y)) #Obtain population ratio R = sum(y)/sum(x)=tau_y/tau_x #r_s=mean(y_s)/mean(x_s) eq 7.1 R_pop=sum(y)/sum(x) #obtain sample ratio r eq. 7.2 page 92 s=sample(1:length(y),10) y_s=y[s] x_s=x[s] r_s=mean(y[s])/mean(x[s]) #take a look if r is an unbiased estimator of R #unbiased? R_pop r_s #the ratio estimate of the population mean is: mu_r_hat=r*mu_x (mu_hat_r=r_s*mean(x)) mean(y) #Now lets obtain ratio estimator od mu_x mu_hat_r<-function(y,x,n,b){ N<-length(y) muhatr<-rep(NA,b) varhatmuhatr<-rep(NA,b) for (i in 1:b){ s<-sample(1:N,n) y_s=y[s] x_s=x[s] r_s=mean(y_s)/mean(x_s) muhatr[i]<-r_s*mean(x) #eq 7.3 and 7.2 varhatmuhatr[i]<-((N-n)/N)*(1/((n-1)*n))*sum((y_s-r_s*x_s)^2) #eq 7.5 and 7.6 } out<-list(muhatr,varhatmuhatr) return(out) } b<-10 mu_hat_r_vec<-mu_hat_r(y,x,10,b)[[1]] hist(mu_hat_r_vec,prob=TRUE, main='Histogram of the mu_hat_r',nclass=20) points(mean(y),0,pch=17,col='red') #lets take a look into the sampling distributions of the #sample mean and ratio estimator b<-10000 run1<-mu_hat_r(y,x,10,b) mu_hat_r_vec<-run1[[1]] hist(mu_hat_r_vec,prob=TRUE,xlim=range(srs[[1]]),col=rgb(0,1,1,0.5), main='Histograms of the mu_hat_r and sample mean',nclass=30) points(mean(y),0,pch=17,col='red') hist(srs[[1]],col=rgb(1,0,0,0.3),add=T,prob=T,nclass=30) #by looking at the plots, which one of the estimators has smaller varance? #by looking at the plots, is there any obvious indication that any of the estimators is biased? #lets look at the actual numbers: #1. bias #bias of the sample mean mean(srs[[1]])-mean(y) #bias of the ratio estimate of the population mean mean(mu_hat_r_vec)-mean(y) #2. Lets take a look in variances of the both estimators: #which estimator has smaller variance? print(paste("Variance of the sample mean estimator:",mean(srs[[2]]),sep='')) print(paste("Variance of the ratio estimate of mu_x:",mean(run1[[2]]),sep='')) #3. Calculate MSE for both estimators #mu_hat_r is not unbiased, so it is useful to study the MSE to compare it to #the sample mean #MSE(mu_hat_r)=var(estimator)+bias(estimator)^2 (MSE_mu_hat_r=var(run1[[1]])+(mean(run1[[1]])-mean(y))^2) (MSE_sample_mean=var(srs[[1]])+(mean(srs[[1]])-mean(y))^2) #which estimator has smaller MSE? ####################################################################### #note: var(run1[[1]]used to obatin MSE(mu_hat_r) is the emirical variance #of the mu_hat_r estimator #this should be approximatelly equal to the var(mu_hat_r) obtained from the #formula 7.4 page 94 var(run1[[1]]) (var_mu_hat_r=((N-n)/N)*(1/((N-1)*n))*sum((y-R_pop*x)^2)) ####################################################################### #Example 2. #Model based inference #we will use the same data for x, but we will simulate the unknown #popluation values {Y1,...YN} from the model Y ~ N(beta*xi,xi*1) #i.e. Y=beta*xi+eps_i where eps_i ~ N(0,xi*1) #We will perform simulation study to obtain sampling distribution #of tau_hat #tau_hat is model-unbiased estimator of tau under the assumption #that the relationship between yi and xi is linear and the line #pass through the origin. So we are going to prove that E(tau_hat)=E(tau) #Note: tau is now a vector since we re going to sample #different realizations from the population vector Y=(Y1,..,YN) #on each iteration of our simulation #the model that we are simulating from is given on the page 107 #where is fixes elements of the variance of Yi to vi=xi and sigma_R^2 #the general formula for the varance(Yi)=vi*sigma_R^2 is given on a #page 106 modelBasedEst=function(Y_sd,beta,x,n,b){ #simulate population from the model hat_tau=tau=err=r=rep(NA,b) for (i in (1:b)){ Y=beta*x+rnorm(N,0,Y_sd) tau[i]=sum(Y) s=sample(1:N,n) r[i]=(mean(Y[s])/mean(x[s])) hat_tau[i]=r[i]*sum(x) # page 107 equation on tau_hat err[i]=((hat_tau[i]-tau[i])^2) #mean squared error } a<-print.noquote(paste("tau hat:","E(tau_hat)=",mean(hat_tau))) out=list(tau=tau,r=r,hat_tau=hat_tau,err=err) return(out) } #to simulate data for the model Y ~ N(beta*xi,xi*1), we are fixing #beta=0.8 run1<-modelBasedEst(Y_sd=x,beta=0.8,x=x,n=10,b=10000) #E(tau_hat)=E(tau) - so tau_hat is model-unbiased estimator of tau (tau=mean(run1$tau)) (tau_hat=mean(run1$hat_tau)) #MSE is a mean from the vector err obtained as MSE of tau_hat for #each of the b samples (MSE=mean(run1$err)) #lets take a look into a histogram of the tau_hat hist(run1$hat_tau,prob=TRUE,col=rgb(0,1,1,0.5), xlab='tau_hat',main='Histograms of the tau_hat',nclass=30) points(tau,0,pch=17,col='red') #under the model, where the var(Yi)=xi*1, the best linear estimator is #beta=sum(y[s])/sum(x[s])=r #lets take a look into the mean of the sampling distribution of r #obtained from b samples in our simulation and see how close it is # to the true value of beta, which is parameter of the model #where b realizations of the population were simulated from. (beta=0.8) mean(run1$r)