#Tutorial 4

#Example 1
#Study the sample size and the margin of error
#run a simulation to study the effect of the sample size on the margin of error:
#verification of the formula for the optimal sample size

clothing<-read.table(file="Clothing.csv",header=T,sep=",")
head(clothing)

zpop<-clothing$sales    #population
mu<-mean(zpop)          #population mean
sigma2<-var(zpop)       #population variance
N<-length(zpop)         #population size
d<-1000                 #margin of error
n<-10                   #sample size
b<-10000                #simulation size

simulate_mean<-function(zpop,n,b){
  N<-length(zpop)
  sigma2<-var(zpop)
  ybar<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n)
    ybar[i]<-mean(zpop[s])
  }
  out<-ybar
  return(out)
}

n<-c(10,30,50,100)
ybar<-list()
par(mfrow=c(2,2))
for (j in (1:length(n))){
  ybar[[j]]<-simulate_mean(zpop,n[j],b)
  hist(ybar[[j]],main=n[j],xlab='ybar')
  abline(v=mu,col="red",lwd=2)
  abline(v=mu-d,col="blue",lwd=2)
  abline(v=mu+d,col="blue",lwd=2)
}

#The first column contains the empirically estimated probability of error,
#i.e. P(|ybar-mu| > d), which equals alpha when d is the margin of error at
#confidence level 1-alpha; the second column contains the corresponding
#probability from the normal approximation
mat<-matrix(NA,length(n),2)
for (j in (1:length(n))){
  mat[j,1]<-length(ybar[[j]][abs(ybar[[j]]-mu)>d])/b
  mat[j,2]<-2*(1-pnorm(sqrt(n[j]/((1-n[j]/N)*sigma2))*d))
}
#observe how the probability of error diminishes as the sample size increases
rownames(mat)<-n
mat

#Example 2
#Estimate the proportion of males among the survivors of the Titanic
titanic<-read.table(file="titanic.csv",header=T,sep=",")
zpop_aux<-titanic$Sex
zpop<-rep(0,length(zpop_aux))
zpop[which(zpop_aux=='Male')]<-1
N<-length(zpop)
p_true<-mean(zpop)   #true proportion of males

n<-15
y<-sample(zpop,n)    #sample 15 survivors
phat<-mean(y)        #proportion of males in the sample

#now let's study the sampling distribution of our estimator phat (the sample mean)
#let's repeat the experiment, i.e., selecting n survivors from the population and
#counting the number of males, over and over again, b=10000 times
b<-10000
phat<-rep(NA,b)
for (i in 1:b){
  s<-sample(zpop,n)
  phat[i]<-mean(s)
}

#different ways of assessing the performance and qualities of an estimator and its sampling distribution
par(mfrow=c(1,1))
hist(phat,prob=TRUE)
abline(v=p_true,col="red")

phat1<-(phat-mean(phat))/sqrt(var(phat))   #standardization so that it is approximately N(0,1)
z<-seq(-4,4,length=100)
hist(phat1,prob=TRUE)
lines(z,dnorm(z),col="red")
qqnorm(phat1)
lines(z,z,col="red")
plot(ecdf(phat1))
lines(z,pnorm(z),col="red")

MSE<-mean((phat-p_true)^2)
var(phat)
bias<-mean(phat)-p_true   #estimated bias; we know that bias=0 for the sample mean,
                          #so this small amount is due to simulation error

#artificial population with small p; say, a bag of balls with very few black balls
#and the rest all white; we want to estimate p = proportion of black balls
N<-100
zpop<-c(rep(1,5),rep(0,95))   #5 balls among 100 are black
p<-mean(zpop)
n<-10
#set.seed(2255)
s<-sample(zpop,n)   #sample 10 balls
phat<-mean(s)       #proportion of black balls in the sample

#now let's study the sampling distribution of our estimator phat (the sample mean)
#let's repeat the experiment, i.e., drawing n balls from the bag and counting the
#number of black balls, over and over again, b=10000 times
b<-10000
phat<-rep(NA,b)
for (i in 1:b){
  s<-sample(zpop,n)
  phat[i]<-mean(s)
}

#different ways of assessing the performance and qualities of an estimator and its sampling distribution
hist(phat,prob=TRUE)
abline(v=p,col="red")
phat1<-(phat-mean(phat))/sqrt(var(phat))   #standardization
z<-seq(-4,4,length=100)
hist(phat1,prob=TRUE)
lines(z,dnorm(z),col="red")
qqnorm(phat1)
lines(z,z,col="red")
plot(ecdf(phat1))
lines(z,pnorm(z),col="red")
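#a quick check (just a sketch; it reuses zpop, N, n, b and the simulated vector phat
#from above, and the helper variable kvals is introduced here only for illustration):
#because the balls are drawn without replacement, the number of black balls in a sample
#of size n is exactly hypergeometric, so the simulated distribution of phat can be
#compared with the exact one
kvals<-0:n
exact<-dhyper(kvals,m=sum(zpop),n=N-sum(zpop),k=n)   #P(number of black balls = kvals)
empirical<-as.numeric(table(factor(round(phat*n),levels=kvals)))/b
round(cbind(k=kvals,exact=exact,empirical=empirical),4)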
MSE<-mean((phat-p)^2)
#estimate the variance of the proportion estimator empirically
var(phat)
#now use the formula: var(phat) = ((N-n)/(N*(n-1)))*p*(1-p)
((N-n)/(N*(n-1)))*p*(1-p)
bias<-mean(phat)-p   #estimated bias; we know that bias=0 for the sample mean,
                     #so this small amount is due to simulation error

#note that if p_hat=0.5 then the sample variance is at its maximum
#the sample variance is: s^2 = n/(n-1)*p_hat*(1-p_hat)
pgrid<-seq(0,1,length=100000)
sample_var<-rep(NA,length=100000)
for (i in (1:100000)){
  sample_var[i]<-n/(n-1)*pgrid[i]*(1-pgrid[i])
}
plot(pgrid,sample_var)
abline(v=0.5,col='red')

#Example 3: Sampling designs with unequal probabilities
#Sampling with replacement and unequal selection probabilities:
#a). Hansen-Hurwitz estimator
#b). Horvitz-Thompson estimator

SRS<-function(zpop,n,b){
  N<-length(zpop)
  ybar<-rep(NA,b)
  varhat<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n,replace=T)
    ybar[i]<-mean(zpop[s])
    varhat[i]<-((N-n)/N)*var(zpop[s])/n
  }
  out<-list(ybar,varhat)
  return(out)
}

zpop<-trees$Volume
mu<-mean(zpop)
p<-trees$Girth
b<-10000
srs<-SRS(zpop,10,b)
par(mfrow=c(1,1))
hist(srs[[1]],prob=TRUE,ylim=c(0,.15))
abline(v=mu,col='red')

#a). Hansen-Hurwitz estimator
HH<-function(zpop,n,b,p){
  N<-length(zpop)
  ybarhh<-rep(NA,b)
  ybar<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n,replace=TRUE,prob=p)
    ybar[i]<-mean(zpop[s])
    ybarhh[i]<-mean(zpop[s]/p[s])/N
  }
  out<-list(ybar,ybarhh)
  return(out)
}

#normalize the girths into selection probabilities
p<-p/sum(p)
hh<-HH(zpop,10,b,p)

#create a histogram of the HH estimator distribution
hist(hh[[2]],col="yellow",prob=TRUE,xlim=c(min(hh[[1]],hh[[2]]),max(hh[[1]],hh[[2]])))
#create a histogram of the plain sample mean under sampling with replacement
#and unequal selection probabilities
hist(hh[[1]],add=TRUE,col="lightblue",prob=TRUE)
points(mu,0,pch=17,col='red')
#Note that the plain sample mean is biased here; HH adjusts the mean estimator for the
#unequal selection probabilities
legend(40,.12,legend=c("HH","sample mean"),fill=c("yellow","lightblue"))

#b). Horvitz-Thompson estimator
#The HT estimator of the population total is unbiased under sampling with or without replacement
#HT = sum(yi/pii), where yi are the sampled values and pii are the inclusion
#probabilities of the sampled units
HT<-function(zpop,n,b,p){
  N<-length(zpop)
  #calculate the inclusion probabilities pii
  pii<-1-(1-p)^n
  ybarht<-rep(NA,b)
  for (i in 1:b){
    s<-sample(1:N,n,replace=TRUE,prob=p)
    #keep only the distinct units, since we sampled with replacement
    s<-unique(s)
    #compute the HT estimator from the distinct units only
    t<-zpop[s]/pii[s]
    ybarht[i]<-sum(t)/N   #sum(t) is the HT estimate of the population total tau;
                          #to estimate ybar we divide tau_hat by N, since tau=N*ybar
  }
  return(ybarht)
}

ht<-HT(zpop,10,b,p)
hist(ht,col="lightgreen",prob=TRUE)
points(mu,0,pch=17,col='red')
#Note that HT is also an unbiased estimator
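#a quick empirical check of (un)biasedness (just a sketch; it assumes mu, srs, hh and ht
#created above are still in the workspace): the plain sample mean under unequal selection
#probabilities is biased, while the HH and HT estimators stay close to the true mean mu
round(c(true_mean        = mu,
        srs_mean         = mean(srs[[1]]),   #SRS with replacement, equal probabilities
        unequal_mean     = mean(hh[[1]]),    #plain sample mean, unequal probabilities: biased
        hansen_hurwitz   = mean(hh[[2]]),    #HH estimator: approximately unbiased
        horvitz_thompson = mean(ht)),3)      #HT estimator: approximately unbiased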