# Comparison between multiple regression and PCA regression
# performed on BullsData.
setwd("C:\\Users\\authorized user\\Dropbox\\STAT445\\week5\\tutorials")
rm(list = ls())

# Warm-up: how would you convert 56 degrees, 45 minutes and 59 seconds
# to degrees in decimal format?
# 56 + 45/60 + 59/3600
# substr("34545", 3, 5)

# read the data
(Data.matrix <- read.csv("BullsData.csv"))

# calculate the correlation matrix
cor(Data.matrix[, 2:9])
round(cor(Data.matrix[, c(3, 4, 5)]), 4)

# extract the variables from the dataset that we want to include in the analysis
SaleHt   <- Data.matrix[, 8]
YrHgt    <- Data.matrix[, 3]
FtFrBody <- Data.matrix[, 4]
PrctFFB  <- Data.matrix[, 5]

# fit the linear model
full.model.fit <- lm(SaleHt ~ YrHgt + FtFrBody + PrctFFB)

# see the summary of the linear model
summary(full.model.fit)

####################################################################################
# What do you see from the summary?
# Which of the variables are not significant?
####################################################################################

####################################################################################
# The drop1 function gives an ANOVA-type table (with the RSS (residual sum of
# squares), SS (sum of squares) and AIC criterion) calculated from the different
# models obtained by excluding each explanatory variable in turn. So it returns
# the RSS, SS and AIC information criterion for each of the variables.
# From this table we look for the model with the smallest RSS: ultimately we want
# the explanatory variables to explain the largest part of the variation in the
# dependent variable, and because RSS measures how much of that variation is NOT
# explained, the variable whose removal yields the lowest RSS is the one we can
# most safely drop.
#
# Note: "for each of the variables" does not mean that the three measures are
# calculated at the variable level; they are calculated for the different fitted
# models in which the respective variable is excluded.
####################################################################################
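####################################################################################
# A minimal sketch of what the RSS column of drop1() reports: refit the model
# with one variable left out and sum the squared residuals. This uses only
# objects defined above; the names rss.full and rss.no.prct are illustrative.
####################################################################################
rss.full    <- sum(resid(full.model.fit)^2)                 # the "<none>" RSS row
rss.no.prct <- sum(resid(lm(SaleHt ~ YrHgt + FtFrBody))^2)  # the "- PrctFFB" RSS row
c(rss.full, rss.no.prct)  # compare with the drop1() table below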
drop1(full.model.fit)

####################################################################################
# The smallest RSS is for the model where the PrctFFB variable is excluded,
# so PrctFFB is the variable we should exclude from the model.
# The same conclusion was reached from summary(full.model.fit),
# where we can see that PrctFFB is not significant.
# This can also be checked by refitting the model without PrctFFB.
####################################################################################
full.model.fit1 <- lm(SaleHt ~ YrHgt + FtFrBody)

####################################################################################
# The summary of the refitted model shows that both explanatory variables
# are now significant.
####################################################################################
summary(full.model.fit1)

####################################################################################
# drop1 on the refitted model shows little difference in RSS between the
# remaining variables.
####################################################################################
drop1(full.model.fit1)

####################################################################################
# Now, let's perform linear regression on the principal components
####################################################################################

# obtain the eigenvalues and eigenvectors of the correlation matrix
eigens <- eigen(cor(Data.matrix[, c(3, 4, 5)]))
(evals <- eigens$values)
(evecs <- eigens$vectors)

# Why is this sum an integer? The eigenvalues of a correlation matrix sum to
# its trace, which equals the number of variables.
sum(evals)

# proportion of the variance explained by each PC, relative to the total variance
(proportion <- evals / sum(evals))

# direction of the third PC: relatively large weights for the second and third
# variables, but with opposite signs. So this is the direction of decreasing
# the second variable while increasing the third.
evecs[, 3]

### standardizing the variables
YrHgt.std    <- (YrHgt - mean(YrHgt)) / sd(YrHgt)
FtFrBody.std <- (FtFrBody - mean(FtFrBody)) / sd(FtFrBody)
PrctFFB.std  <- (PrctFFB - mean(PrctFFB)) / sd(PrctFFB)
StdMatrix    <- cbind(YrHgt.std, FtFrBody.std, PrctFFB.std)

# the covariance matrix of the standardized data should equal the
# correlation matrix of the unstandardized data
round(cov(StdMatrix), 4)

# now construct the PCs
PC <- StdMatrix %*% evecs

# redo the PCA: the eigenvector matrix is essentially the identity matrix,
# so nothing changes if you repeat the PCA on the previously computed PCs.
# This is because the PCs are already aligned with the principal axes.
eigen(cov(PC))

PC1 <- PC[, 1]
PC2 <- PC[, 2]
PC3 <- PC[, 3]

# fit the linear regression model on the PCs
PC.reg <- lm(SaleHt ~ PC1 + PC2 + PC3)
summary(PC.reg)

PC.reg1 <- lm(SaleHt ~ PC1 + PC2)
summary(PC.reg1)

##########################################################
# How many PCs would you keep in this model?
# Is there a PC whose direction is not useful for predicting SaleHt?
# What is that direction?
##########################################################
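##########################################################
# A minimal cross-check (sketch): the built-in prcomp() on the same three
# columns, with scale. = TRUE, should reproduce the hand-rolled PCA above.
# The eigenvalues are the squared sdev values, and the scores in pca.fit$x
# should match PC up to the sign of each column. The name pca.fit is an
# illustrative choice.
##########################################################
pca.fit <- prcomp(Data.matrix[, c(3, 4, 5)], scale. = TRUE)
round(pca.fit$sdev^2, 4)   # should match evals
pca.fit$rotation           # should match evecs up to column signs
summary(pca.fit)           # proportions of variance, matching 'proportion'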