#run a simulation to study the effect of sample size and the margin of error: #verification of the formula for the optimal sample size setwd("C:/Users/Shirin/Desktop/STUFF/STAT-410") clothing<-read.table(file="Clothing.csv",header=T,sep = ",") yp<-clothing$sales#population mu<-mean(yp) #population mean sig2<-var(yp) #population variance N<-length(yp) #population size d<-1000 #margin of error n<-10 #sample size b<-10000 #simulation size simmean<-function(yp,n,b) { N<-length(yp) sig2<-var(yp) ybar<-c() for (i in 1:b) { s<-sample(1:N,n) ys<-yp[s] ybar[i]<-mean(ys) } out<-ybar return(out) } n<-c(10,30,50,100) ybar1<-simmean(yp,n[1],b) ybar2<-simmean(yp,n[2],b) ybar3<-simmean(yp,n[3],b) ybar4<-simmean(yp,n[4],b) par(mfrow=c(2,2)) hist(ybar1) abline(v=mu,col="red",lwd=2) abline(v=mu-d,col="blue",lwd=2) abline(v=mu+d,col="blue",lwd=2) hist(ybar2) abline(v=mu,col="red",lwd=2) abline(v=mu-d,col="blue",lwd=2) abline(v=mu+d,col="blue",lwd=2) hist(ybar3) abline(v=mu,col="red",lwd=2) abline(v=mu-d,col="blue",lwd=2) abline(v=mu+d,col="blue",lwd=2) hist(ybar4) abline(v=mu,col="red",lwd=2) abline(v=mu-d,col="blue",lwd=2) abline(v=mu+d,col="blue",lwd=2) length(ybar1[abs(ybar1-mu)>d])/b length(ybar2[abs(ybar2-mu)>d])/b length(ybar3[abs(ybar3-mu)>d])/b length(ybar4[abs(ybar4-mu)>d])/b 2*(1-pnorm(sqrt(n[1]/((1-n[4]/N)*sig2))*d)) 2*(1-pnorm(sqrt(n[2]/((1-n[4]/N)*sig2))*d)) 2*(1-pnorm(sqrt(n[3]/((1-n[4]/N)*sig2))*d)) 2*(1-pnorm(sqrt(n[4]/((1-n[4]/N)*sig2))*d)) #Estimate the proportion of males among the survivors of titanic setwd("C:/Users/Shirin/Desktop/STUFF/STAT-410") titanic<-read.table(file="titanic.csv",header=T,sep = ",") yp<-titanic$sex N<-length(yp) p<-mean(yp)#true proportion of males n<-50 ys<-sample(yp,n)#sample 50 survivors phat<-mean(ys)#proportion of females in the sample #now let's study the sampling distribution of our estimator phat (the sample mean) #let's repeat the experiment, i.e., selecting n survivors from the population and counting the number #males over and over again for b=10000 times b<-10000 phat<-c() for (i in 1:b){ ys<-sample(yp,n) phat[i]<-mean(ys) } #different ways of assessing performance and qualities of an estimator and its sampling distribtion hist(phat,prob=TRUE) abline(v=p,col="red") phat1<-(phat-mean(phat))/sqrt(var(phat))#normalization z<-seq(-4,4,length=100) hist(phat1,prob=TRUE) lines(z,dnorm(z),col="red") qqnorm(phat1) lines(z,z,col="red") plot(ecdf(phat1)) lines(z,pnorm(z),col="red") MSE<-mean((phat-p)^2) var(phat) bias<-mean(phat)-p#estimated bias, note that we know that bias=0 for the sample mean but this #small amount is due to simulation error #artificial population with small p; let say a bag of balls with very few black balls #and the rest are all white, want to estimate p=proportion of black balls N<-100 yp<-c(rep(1,50),rep(0,50)) #only 5 balls among 100 are black p<-mean(yp) n<-10 ys<-sample(yp,n)#sample 50 survivors phat<-mean(ys)#proportion of females in the sample #now let's study the sampling distribution of our estimator phat (the sample mean) #let's repeat the experiment, i.e., selecting n survivors from the population and counting the number #males over and over again for b=10000 times b<-10000 phat<-c() for (i in 1:b){ ys<-sample(yp,n) phat[i]<-mean(ys) } #different ways of assessing performance and qualities of an estimator and its sampling distribtion hist(phat,prob=TRUE) abline(v=p,col="red") phat1<-(phat-mean(phat))/sqrt(var(phat))#normalization z<-seq(-4,4,length=100) hist(phat1,prob=TRUE) lines(z,dnorm(z),col="red") qqnorm(phat1) lines(z,z,col="red") plot(ecdf(phat1)) lines(z,pnorm(z),col="red") MSE<-mean((phat-p)^2) var(phat) bias<-mean(phat)-p#estimated bias, note that we know that bias=0 for the sample mean but this #small amount is due to simulation error