#Tutorial-2

#Q: Why is var(ybar)=(N-n)sigma2/nN instead of var(ybar)=sigma2/n as it was in the math-stat class?!

#We will study the variance of the sample mean as:
#1. Population size, N, gets large
#2. In connection with sampling with replacement
#3. In the model-based approach

n<-10
Nvec<-c(50,100,200,300,400,500)
Ypop<-list() 
sig2<-c()
for (i in 1:6) {
  Ypop[[i]]<-rnorm(Nvec[i])
  sig2[i]<-var(Ypop[[i]])
}

b<-10000
varhatb<-c()
for (i in 1:6) {
  ybar<-c()
  for (j in 1:b) {
    s<-sample(Ypop[[i]],n,replace=F)
    ybar[j]<-mean(s)
  }
  varhatb[i]<-var(ybar)
}
plot(Nvec,(varhatb-sig2/n)^2,ylab="e",xlab="N")

#Now we will do the same, but take another sample each time using sampling with replacement


Nvec<-c(50,100,500,1000,5000)
nvec<-c(10,30,50,100,500)
Ypop<-list() 
sig2<-c()
for (i in 1:5) {
  Ypop[[i]]<-rnorm(Nvec[i])
  sig2[i]<-var(Ypop[[i]])
}


b<-10000
varhatb1<-c()
varhatb2<-c()
for (i in 1:5) {
  ybar1<-c()
  ybar2<-c()
  for (j in 1:b) {
    s1<-sample(Ypop[[i]],nvec[i],replace=F)
    ybar1[j]<-mean(s1)
    s2<-sample(Ypop[[i]],nvec[i],replace=T)
    ybar2[j]<-mean(s2)
  }
  varhatb1[i]<-var(ybar1)
  varhatb2[i]<-var(ybar2)
}
plot(varhatb2,pch=16,col="red",type="l",ylab="var")
lines(varhatb1,pch=16,col="blue")
legend(3.5,max(varhatb2),legend=c("SRS-WR","SRS-WoR"),col=c("red","blue"),lty=c(1,1))

plot((varhatb1-varhatb2)^2,ylab="e")



# Consider the example in tutorial-1 where we generated a population from a N(5,1) distribution
#and then assumed that it is fixed when we ran the simulations and generated the sample from
#the same population ignoring the fact that those values were generated from a model N(5,1).
#Now we will run a simulation but let the population vary each time as well, i.e., we generate
#a new population each time from N(5,1):

N<-100
n<-10
b<-10000
ybar<-c()
for (j in 1:b) {
  y.p<-rnorm(N,5,1)
  s<-sample(y.p,n)#it would be the same as generating the sample directly by rnorm(10,5,1)
  ybar[j]<-mean(s)
}

#look at the sampling distribution of ybar
hist(ybar)
#estimate of the variance of ybar from the simulation
var(ybar)
1/n#sigma2/n

fixpopvar<-function(popvar,N,n){
  term1<-(N-n)/N
  term2<-popvar/n
  out<-term1*term2
  return(out)
}

fixpopvar(1,100,10)


#Recall example 2, page 17; we will do the same with a larger population and sample size, i.e., we will 
#look at all the possible samples from a fixed population and obtain the exact sampling 
#distribution of the sample mean:

y.p<-1:10#the population consists of numbers 1,2,...,10
mu<-mean(y.p)#population mean
sig2<-var(y.p)#population variance
N<-length(y.p)#population size
n<-5#sample size
M<-choose(N,n)#number of possible samples (without replacement) of size n
C<-combn(y.p,n)#a n*M matrix whose columns are all the possible samples of size n
#to calculate the sample mean for all possible samples we just need to take the mean of
#columns of C:
ybar<-apply(C,2,mean)#the apply function is used to apply the "mean" function to the columns of C
mean(ybar)
mean(ybar)==mu#sample mean is unbiased
pmf<-table(ybar)/M#exact sampling distribution of ybar, i.e., all possible values that ybar
#takes over all possibe samples with the probability mass corresponding to each value
#we can do the same for the sample variance and the estimate of the variance of the sample mean:
plot(pmf,ylab="p(ybar)")
s2<-apply(C,2,var)
mean(s2)
mean(s2)==sig2#sample variance is an unbiased estimator for the population variance
#define a function for calculating the estimate of var(sample mean)
ybarvar<-function(s,N){
  n<-length(s)
  term1<-(N-n)/N
  term2<-var(s)/n
  out<-term1*term2
  return(out)
}
ybar.var<-apply(C,2,ybarvar,N=100)
#practice: Do the same for the estimate of population total and the estimaof its variance 
#as in example 2.