#Tutorial-3

# Example 1: See if CLT holds for SRS without replacement on the islands data set
# Example 2:- Estimate coverage probabilities for 95% CI
#           - Calculate coverage probabilities for several sample sizes and make plots

rm(list=ls())
#Example 1:
#we will use the data set islands which is part of R basic dataset package
#Finite Population CLT in SRS without replacement
zpop<-islands####The areas in thousands of square miles of the landmasses which
#exceed 10,000 square miles.##################
hist(zpop)
mu<-mean(zpop)
sigma2<-var(zpop)
N<-length(zpop)
n<-7

smpl_index<-sample(1:N,n)
s<-zpop[smpl_index]


#### See if it holds finite population CLT for SRS without replacement
smpl_mean_var <- function(zpop,n,b) {
  N<-length(zpop)
  sigma2<-var(zpop)
  ybar<-rep(NA,b)
  for (i in 1:b) {
    s<-sample(1:N,n)
    ybar[i]<-mean(zpop[s])
  }
  ybar_var<-((N-n)/N)*(sigma2/n)
  out<-list(ybar=ybar,ybar_var=ybar_var)
  return(out)
}

#now use the function  smpl_mean_var to simulate sample mean from b samples
#of sample sizes c(5,10,15,24,35,45)
#and plot the standardized ybar against standard normal to see if they match
par(mfrow=c(2,3)) # splits the screen into a matrix with dimension (2,3)
#so that we can accomodate histograms in each of the cells of the matrix
b<-10000
for (n in c(5,10,15,24,35,45)) {
  #simulate a vector of sample means of b samples of size n from zpop
  simul<-smpl_mean_var(zpop,n,b)
  ybar_standardize<-(simul[[1]]-mu)/sqrt(simul[[2]])
  #see the distribution of the stanardized ybar
  hist(ybar_standardize,prob=T,ylim=c(0,1.2),main=n)
  #create a grid of length 100
  z<-seq(min(ybar_standardize),max(ybar_standardize),length=100)
  #evaluate standard normal distribution at z values from the grid
  lines(z,dnorm(z),col="red") # the lines function assumes that there is already a plot or histogram
                              # so it just drws a line on the existing histogram
  }

#Q: Does the histograms follow the standard normal (red line) as sample size n increases?
#i.e., does the CLT hold under the SRS without replacement?



#Example 2:
#Check this web site for data sets
#https://vincentarelbundock.github.io/Rdatasets/datasets.html

#Estimate the exact coverage probability for the approximate (1-alpha)% CIs through simulation

catsDataSet<-read.table(file="catsM.csv",header=T,sep = ",") 


head(catsDataSet) # the head function returns the first few rows of the data set together with the 
               # structure of the table
#three columns: Sex (F or M), Bwt (Body weight in kg), Hwt (heart weight in g)
#lets take the heart weight and use it as our population
zpop<-catsDataSet$Hwt #population
mu<-mean(zpop) #population mean
N<-length(zpop) #population size
sigma2=var(zpop)
alpha<-.05 #so that (1-alpha) is .95
n<-7 #sample size
b<-10000 #number of iterations in simulation
counter_true<-0
lower<-rep(NA,b)
upper<-rep(NA,b)
for (i in 1:b) {
  smpl_index<-sample(1:N,n)
  s=zpop[smpl_index]
  ybar<-mean(s)
  var_hat_ybar<-((N-n)/N)*var(s)/n
  lower[i]<-ybar-qnorm((1-alpha/2))*sqrt(var_hat_ybar)
  upper[i]<-ybar+qnorm((1-alpha/2))*sqrt(var_hat_ybar)
  if (mu>lower[i] & mu<upper[i]) counter_true<-counter_true+1 #check if the 95%CI covers the true mean i.e., population mean mu
}
#counter_true is a counter of all the b confidence intervals which contained the true mean (population mean)
cover_prob<-counter_true/b #estimated coverage probability which is a proportion of the intervals that contained the poulation mean


#See how the coverage probability changes with changes of n
coverage_prob<-function(zpop,n){
  N=length(zpop)
  mu<-mean(zpop)
  counter_true<-0
  lower<-rep(NA,b)
  upper<-rep(NA,b)
  for (i in 1:b) {
    smpl_index<-sample(1:N,n)
    s=zpop[smpl_index]
    ybar<-mean(s)
    var_hat_ybar<-((N-n)/N)*var(zpop)/n
    delta<-qnorm((1-alpha/2))*sqrt(var_hat_ybar)
    lower[i]<-ybar-delta
    upper[i]<-ybar+delta
    if (mu>lower[i] & mu<upper[i]) counter_true<-counter_true+1 #check if the CI covers the true mean (population mean)
  }
  cover_prob<-counter_true/b
  return(cover_prob)
}

#calculate coverage probability for different n 
ngrid=seq(10,100,length=20)
cp<-rep(NA,length(ngrid))

for (n in (1:length(ngrid))) {
  cp[n]<-coverage_prob(zpop,n)
}

#save the coverage probability plots in .png file
#the .png file will be saved in the working directory
#if you want to change the directory just put a full path 
#i.e.,instead of CoverageP.png put a full path
png('CoverageP.png')
par(mfrow=c(1,1))
plot(ngrid,cp,ylim=c(.92,.98),pch=19, xlab='n',ylab='Coverage prob.',main='Coverage probabilities with respect to n')
abline(h=.95,col="red")
dev.off()