# Comparison between multiple regression and PCA regression
# performed on BullsData.
setwd("C:\\Users\\authorized user\\Dropbox\\STAT445\\week5\\tutorials")
rm(list = ls())

# Warm-up: how would you convert 56 degrees, 45 minutes and 59 seconds
# to degrees in decimal format?
# 56 + 45/60 + 59/3600
# substr("34545", 3, 5)

# read the data
(Data.matrix <- read.csv("BullsData.csv"))

# calculate the correlation matrix
cor(Data.matrix[, 2:9])
round(cor(Data.matrix[, c(3, 4, 5)]), 4)

# extract the variables from the dataset that we want to include in the analysis
SaleHt   <- Data.matrix[, 8]
YrHgt    <- Data.matrix[, 3]
FtFrBody <- Data.matrix[, 4]
PrctFFB  <- Data.matrix[, 5]

# fit the linear model
full.model.fit <- lm(SaleHt ~ YrHgt + FtFrBody + PrctFFB)

# see the summary of the linear model
summary(full.model.fit)

####################################################################################
# What do you see from the summary?
# Which of the variables are not significant?
####################################################################################

####################################################################################
# The drop1 function gives an ANOVA-type table (with the RSS (residual sum of
# squares), SS (sum of squares) and AIC criterion) calculated from the different
# models obtained by excluding each explanatory variable in turn. So it returns
# the RSS, SS and AIC information criterion for each of the variables.
# From this table we look for the model with the smallest RSS: ultimately we want
# the explanatory variables to explain the largest part of the variation in the
# dependent variable, and because RSS measures how much of that variation is NOT
# explained, the variable whose removal yields the lowest RSS is the one we can
# most safely drop.
#
# Note: "for each of the variables" does not mean that the three measures are
# calculated at the variable level; they are calculated for the different fitted
# models in which the respective variable is excluded.
####################################################################################
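####################################################################################
# A minimal sketch of what the RSS column of drop1() reports: refit the model
# with one variable left out and sum the squared residuals. This uses only
# objects defined above; the names rss.full and rss.no.prct are illustrative.
####################################################################################
rss.full    <- sum(resid(full.model.fit)^2)                 # the "<none>" RSS row
rss.no.prct <- sum(resid(lm(SaleHt ~ YrHgt + FtFrBody))^2)  # the "- PrctFFB" RSS row
c(rss.full, rss.no.prct)  # compare with the drop1() table below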
drop1(full.model.fit)

####################################################################################
# The smallest RSS is for the model where the PrctFFB variable is excluded,
# so PrctFFB is the variable we should exclude from the model.
# The same conclusion was reached from summary(full.model.fit),
# where we can see that PrctFFB is not significant.
# This can also be checked by refitting the model without PrctFFB.
####################################################################################
full.model.fit1 <- lm(SaleHt ~ YrHgt + FtFrBody)

####################################################################################
# The summary of the refitted model shows that both explanatory variables
# are now significant.
####################################################################################
summary(full.model.fit1)

####################################################################################
# drop1 on the refitted model shows little difference in RSS between the
# remaining variables.
####################################################################################
drop1(full.model.fit1)

####################################################################################
# Now, let's perform linear regression on the principal components
####################################################################################

# obtain the eigenvalues and eigenvectors of the correlation matrix
eigens <- eigen(cor(Data.matrix[, c(3, 4, 5)]))
(evals <- eigens$values)
(evecs <- eigens$vectors)

# Why is this sum an integer? The eigenvalues of a correlation matrix sum to
# its trace, which equals the number of variables.
sum(evals)

# proportion of the variance explained by each PC, relative to the total variance
(proportion <- evals / sum(evals))

# direction of the third PC: relatively large weights for the second and third
# variables, but with opposite signs. So this is the direction of decreasing
# the second variable while increasing the third.
evecs[, 3]

### standardizing the variables
YrHgt.std    <- (YrHgt - mean(YrHgt)) / sd(YrHgt)
FtFrBody.std <- (FtFrBody - mean(FtFrBody)) / sd(FtFrBody)
PrctFFB.std  <- (PrctFFB - mean(PrctFFB)) / sd(PrctFFB)
StdMatrix    <- cbind(YrHgt.std, FtFrBody.std, PrctFFB.std)

# the covariance matrix of the standardized data should equal the
# correlation matrix of the unstandardized data
round(cov(StdMatrix), 4)

# now construct the PCs
PC <- StdMatrix %*% evecs

# redo the PCA: the eigenvector matrix is essentially the identity matrix,
# so nothing changes if you repeat the PCA on the previously computed PCs.
# This is because the PCs are already aligned with the principal axes.
eigen(cov(PC))

PC1 <- PC[, 1]
PC2 <- PC[, 2]
PC3 <- PC[, 3]

# fit the linear regression model on the PCs
PC.reg <- lm(SaleHt ~ PC1 + PC2 + PC3)
summary(PC.reg)

PC.reg1 <- lm(SaleHt ~ PC1 + PC2)
summary(PC.reg1)

##########################################################
# How many PCs would you keep in this model?
# Is there a PC whose direction is not useful for predicting SaleHt?
# What is that direction?
##########################################################
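##########################################################
# A minimal cross-check (sketch): the built-in prcomp() on the same three
# columns, with scale. = TRUE, should reproduce the hand-rolled PCA above.
# The eigenvalues are the squared sdev values, and the scores in pca.fit$x
# should match PC up to the sign of each column. The name pca.fit is an
# illustrative choice.
##########################################################
pca.fit <- prcomp(Data.matrix[, c(3, 4, 5)], scale. = TRUE)
round(pca.fit$sdev^2, 4)   # should match evals
pca.fit$rotation           # should match evecs up to column signs
summary(pca.fit)           # proportions of variance, matching 'proportion'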