#Tutorial 2




#We will study the variance of the sample mean as:
#1. Population size, N, gets large
#2. As sampling fraction  n/N changes
#3. all possible samples to obtain sample variance as an unbiased estimate of the population variance
#   -  Obtain confidence intervals of smple mean and total population


#Excercise 1.
#generate a list of populations with different sizes
#and calculate mean and variance for 
#each of the populations
n<-7
N_vector<-c(50,100,200,300,400,500)

Zpop<-list() 

mean_pop_vector=rep(NA,length(N_vector))
sigma2_pop_vector=rep(NA,length(N_vector))

for (i in 1:length(N_vector)) {
  Zpop[[i]]<-rnorm(N_vector[i])
  mean_pop_vector[i]<-mean(Zpop[[i]])
  sigma2_pop_vector[i]<-var(Zpop[[i]])
}


#now calculate variances of the sample means
#for each of the populations with different sizes
#sample b samples and calculate means
#then calculate variances of each of the means and make a plot
#study the sample variance as population size N gets bigger
b<-10000
varhatb<-rep(NA,length(N_vector))

for (i in 1:length(N_vector)) {
  ybar<-rep(NA,b)
  for (j in 1:b) {
    s<-sample(Zpop[[i]],n,replace=F)
    ybar[j]<-mean(s)
  }
  varhatb[i]<-var(ybar)
}


plot(N_vector,(varhatb-sigma2_pop_vector/n)^2,ylab="error",xlab="Population size N")


#Excercise 2.
#Question: Why is var(ybar)=(N-n)sigma2/nN instead of var(ybar)=sigma2/n as it was in the math-stat class?!
# We will study this on a population generated from a N(10,1) distribution

#try these values of N and n
#N<-10000
#n<-200

N<-10000
n<-50
b<-10000
ybar<-rep(NA,b)
zpop<-rnorm(N,10,1)
for (j in 1:b) {
  s<-sample(zpop,n)
  ybar[j]<-mean(s)
}

#look at the sampling distribution of ybar
hist(ybar)
#estimate of the variance of ybar from the simulation
var(ybar)
#the population variance is sigma2=1 (because zpop was generated from nrom(10,1))
sigma2=1
#this is the second term of the formula for the variance of var(ybar)=sigma2/n
sigma2/n

#now include the correction factor in the formula for var(ybar)=((N-n)/N)*sigma2/n
correctpopvar<-function(popvar,N,n){
  term1<-(N-n)/N
  term2<-popvar/n
  out<-term1*term2
  return(out)
}
#run the correction of the variance of ybar
correctpopvar(1,N,n)


#Excercise 3.

#Like in example 2, page 16, we will 
#exhaust all the possible samples from a fixed population and obtain the exact sampling 
#distribution of the sample mean:

Zpop<-1:10#the population consists of numbers 1,2,...,10
mu<-mean(Zpop)#population mean
sigma2<-var(Zpop)#population variance
N<-length(Zpop)#population size
n<-5#sample size
M<-choose(N,n)#number of possible samples (without replacement) of size n
Comb<-combn(Zpop,n)#a n*M matrix with all posible samples (of size n) on the columns
#to calculate the sample mean for each possible samples we use the apply function 
#which takes mean for each of the columns (i.e., samples) of the matrix Comb
ybar<-apply(Comb,2,mean)

#look at the distribution of ybar
hist(ybar)
#compare the mean of the ybar to the population mean
mean(ybar)
(mean(ybar)==mu)#sample mean is unbiased

#we can do the same for the sample variance and the estimate of the variance of the sample mean:
s2<-apply(Comb,2,var)
#sample variance is an unbiased estimator of the population variance
mean(s2)==sigma2

#exact sampling distribution of ybar, i.e., the frequency of each value that ybar takes
#normalized by the number of all possibe samples
pmf<-table(ybar)/M
plot(pmf,ylab="p(ybar)")

#define a function for calculating the estimate of var(sample mean)
ybarvar<-function(smpl,N){
  n<-length(smpl)
  term1<-(N-n)/N
  term2<-var(smpl)/n
  out<-term1*term2
  return(out)
}
#calculate variance of each of the sample means
#and use it to obtin 95% confidence interval
#now we use the user specified function (ybarvar) to be applied to each column of the matrix Comb
#Note that first argument of ybarvar is smpl and when ybarvar is called from 
#the apply function the argument smpl is passed as a column of the Comb function
#so the first argument in the user specified function sould correspond to what we have
#on the columns (or could be rows) of the matrix (in this case Comb) which is passed 
#through the apply function. The second argument of ybarvar is passed after the function (i.e, ybarvar,N=10)
#
ybar.var<-apply(Comb,2,ybarvar,N=N)

mat_95CI_ybar.var=cbind(ybar+qt(0.025,n-1)*sqrt(ybar.var),ybar+qt(0.975,n-1)*sqrt(ybar.var))
# The matrix mat_95CI_ybar.var contains confidence intervalsfor each of the
# sample means stored in the ybar variable

# Repeat the same procedure to obtain estimate of population total and the estimate of its variance 
tau_hat=N*ybar

mat_95tau_hat=cbind(tau_hat+qt(0.025,n-1)*N*sqrt(ybar.var),ybar+qt(0.975,n-1)*N*sqrt(ybar.var))

#alternatively the variance of the total population estimator could be estimated by this function
tau_hat_var_hat<-function(ybar.var,N){
  out=N*sqrt(ybar.var)
  return(out)
}
mat_95tau_hat=cbind(tau_hat+qt(0.025,n-1)*tau_hat_var_hat(ybar.var,N),tau_hat+qt(0.975,n-1)*tau_hat_var_hat(ybar.var,N))