#run a simulation to study the effect of sample size and the margin of error: 
#verification of the formula for the optimal sample size
setwd("C:/Users/Shirin/Desktop/STUFF/STAT-410")
clothing<-read.table(file="Clothing.csv",header=T,sep = ",") 
yp<-clothing$sales#population
mu<-mean(yp) #population mean
sig2<-var(yp) #population variance
N<-length(yp) #population size
d<-1000 #margin of error
n<-10 #sample size
b<-10000 #simulation size
simmean<-function(yp,n,b) {
  N<-length(yp)
  sig2<-var(yp)
  ybar<-c()
  for (i in 1:b) {
    s<-sample(1:N,n)
    ys<-yp[s]
    ybar[i]<-mean(ys)
  }
  out<-ybar
  return(out)
}

n<-c(10,30,50,100)
ybar1<-simmean(yp,n[1],b)
ybar2<-simmean(yp,n[2],b)
ybar3<-simmean(yp,n[3],b)
ybar4<-simmean(yp,n[4],b)

par(mfrow=c(2,2))
hist(ybar1)
abline(v=mu,col="red",lwd=2)
abline(v=mu-d,col="blue",lwd=2)
abline(v=mu+d,col="blue",lwd=2)
hist(ybar2)
abline(v=mu,col="red",lwd=2)
abline(v=mu-d,col="blue",lwd=2)
abline(v=mu+d,col="blue",lwd=2)
hist(ybar3)
abline(v=mu,col="red",lwd=2)
abline(v=mu-d,col="blue",lwd=2)
abline(v=mu+d,col="blue",lwd=2)
hist(ybar4)
abline(v=mu,col="red",lwd=2)
abline(v=mu-d,col="blue",lwd=2)
abline(v=mu+d,col="blue",lwd=2)


length(ybar1[abs(ybar1-mu)>d])/b
length(ybar2[abs(ybar2-mu)>d])/b
length(ybar3[abs(ybar3-mu)>d])/b
length(ybar4[abs(ybar4-mu)>d])/b

2*(1-pnorm(sqrt(n[1]/((1-n[4]/N)*sig2))*d))
2*(1-pnorm(sqrt(n[2]/((1-n[4]/N)*sig2))*d))
2*(1-pnorm(sqrt(n[3]/((1-n[4]/N)*sig2))*d))
2*(1-pnorm(sqrt(n[4]/((1-n[4]/N)*sig2))*d))




#Estimate the proportion of males among the survivors of titanic
setwd("C:/Users/Shirin/Desktop/STUFF/STAT-410")
titanic<-read.table(file="titanic.csv",header=T,sep = ",") 
yp<-titanic$sex
N<-length(yp)
p<-mean(yp)#true proportion of males
n<-50
ys<-sample(yp,n)#sample 50 survivors
phat<-mean(ys)#proportion of females in the sample

#now let's study the sampling distribution of our estimator phat (the sample mean)
#let's repeat the experiment, i.e., selecting n survivors from the population and counting the number
#males over and over again for b=10000 times
b<-10000
phat<-c()
for (i in 1:b){
  ys<-sample(yp,n)
  phat[i]<-mean(ys)
}
#different ways of assessing performance and qualities of an estimator and its sampling distribtion

hist(phat,prob=TRUE)
abline(v=p,col="red")
phat1<-(phat-mean(phat))/sqrt(var(phat))#normalization
z<-seq(-4,4,length=100)
hist(phat1,prob=TRUE)
lines(z,dnorm(z),col="red")
qqnorm(phat1)
lines(z,z,col="red")
plot(ecdf(phat1))
lines(z,pnorm(z),col="red")
MSE<-mean((phat-p)^2)
var(phat)
bias<-mean(phat)-p#estimated bias, note that we know that bias=0 for the sample mean but this
#small amount is due to simulation error


#artificial population with small p; let say a bag of balls with very few black balls
#and the rest are all white, want to estimate p=proportion of black balls 
N<-100
yp<-c(rep(1,50),rep(0,50)) #only 5 balls among 100 are black
p<-mean(yp)

n<-10
ys<-sample(yp,n)#sample 50 survivors
phat<-mean(ys)#proportion of females in the sample

#now let's study the sampling distribution of our estimator phat (the sample mean)
#let's repeat the experiment, i.e., selecting n survivors from the population and counting the number
#males over and over again for b=10000 times
b<-10000
phat<-c()
for (i in 1:b){
  ys<-sample(yp,n)
  phat[i]<-mean(ys)
}
#different ways of assessing performance and qualities of an estimator and its sampling distribtion

hist(phat,prob=TRUE)
abline(v=p,col="red")
phat1<-(phat-mean(phat))/sqrt(var(phat))#normalization
z<-seq(-4,4,length=100)
hist(phat1,prob=TRUE)
lines(z,dnorm(z),col="red")
qqnorm(phat1)
lines(z,z,col="red")
plot(ecdf(phat1))
lines(z,pnorm(z),col="red")
MSE<-mean((phat-p)^2)
var(phat)
bias<-mean(phat)-p#estimated bias, note that we know that bias=0 for the sample mean but this
#small amount is due to simulation error