# This will be used as an example of creating a notebook in HTML, DOC, or PDF formats.
# See
# http://rmarkdown.rstudio.com/articles_report_from_r_script.html
# for more information
# Use plain ASCII quotes in printed output so that summary() output renders
# correctly in the compiled notebook.
options(useFancyQuotes = FALSE)
# Load the helper functions. sf.cld.plot.bar(), used later in this script, is
# not defined in this file, so presumably it comes from here -- TODO confirm.
# A local copy can be sourced instead:
# source("schwarz.functions.r")
# NOTE(review): this fetches and executes code over plain HTTP; consider
# sourcing a vetted local copy.
source("http://www.stat.sfu.ca/~cschwarz/Stat-650/Notes/MyPrograms/schwarz.functions.r")
# This is a quick demo of using Rstudio
# Build the integer vector 1, 2, ..., 10, echo it, and plot it against itself.
x <- seq_len(10)
x
## [1] 1 2 3 4 5 6 7 8 9 10
plot(x = x, y = x)

# This script will read in the cereal data set,
# do a simple listing,
# fit a regression line,
# draw a scatter plot and add the line to the plot
# do a single factor crd anova
# get the compact letter display
# make some plots
# load required libraries
library(ggplot2)
library(emmeans)
library(readxl)
# Load the cereal data from the CSV file. The first row holds the column
# names; as.is = TRUE keeps text columns as character vectors and
# strip.white = TRUE trims blanks around unquoted fields.
cereal <- read.csv(
  file = "cereal.csv",
  header = TRUE,
  as.is = TRUE,
  strip.white = TRUE
)
# Read the "cereal" sheet of the Excel workbook, skipping the first 7 rows,
# then convert the column names into syntactically valid R names.
cereal2 <- readxl::read_excel(
  path = "ALLofDATA.xls",
  sheet = "cereal",
  skip = 7
)
names(cereal2) <- make.names(names(cereal2))
# Derive new columns: shelfF is shelf converted to a factor (for use as a
# categorical variable) and Calories.fr.Protein is protein multiplied by 4.
# Then check the structure of the data frame.
cereal[["shelfF"]] <- factor(cereal[["shelf"]])
cereal[["Calories.fr.Protein"]] <- 4 * cereal[["protein"]]
str(cereal)
## 'data.frame': 77 obs. of 17 variables:
## $ name : chr "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
## $ mfr : chr "N" "Q" "K" "K" ...
## $ type : chr "C" "C" "C" "C" ...
## $ calories : int 60 110 80 50 110 110 110 140 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins : int 25 0 25 25 25 25 25 25 25 25 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.331 NA 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ shelfF : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
## $ Calories.fr.Protein: num 16 12 16 16 8 8 8 12 8 12 ...
# Show rows 1 through 5 with every column
cereal[seq_len(5), ]
## name mfr type calories protein fat sodium fiber
## 1 100%_Bran N C 60 4 1 130 10
## 2 100%_Natural_Bran Q C 110 3 5 15 2
## 3 All-Bran K C 80 4 1 260 9
## 4 All-Bran_with_Extra_Fiber K C 50 4 0 140 14
## 5 Almond_Delight R C 110 2 2 200 1
## carbo sugars shelf potass vitamins weight cups shelfF
## 1 5 6 3 280 25 1 0.331 3
## 2 8 8 3 135 0 1 NA 3
## 3 7 5 3 320 25 1 0.330 3
## 4 8 0 3 330 25 1 0.500 3
## 5 14 8 3 NA 25 1 0.750 3
## Calories.fr.Protein
## 1 16
## 2 12
## 3 16
## 4 16
## 5 8
# Ways of pulling variables out of the data frame.
# 1. A single column by name with `$`
cereal$calories
## [1] 60 110 80 50 110 110 110 140 90 90 120 110 130 100 110 110 110
## [18] 100 110 110 100 100 90 100 100 110 90 120 130 100 100 100 100 110
## [35] 110 130 110 120 100 140 100 100 110 110 150 150 160 90 120 140 90
## [52] 130 130 90 40 50 100 90 120 90 90 110 100 80 80 90 110 100
## [69] 80 100 150 110 100 110 100 90 110
# 2. The same column using [rows, columns] indexing with the row index left empty
cereal[, "calories"]
## [1] 60 110 80 50 110 110 110 140 90 90 120 110 130 100 110 110 110
## [18] 100 110 110 100 100 90 100 100 110 90 120 130 100 100 100 100 110
## [35] 110 130 110 120 100 140 100 100 110 110 150 150 160 90 120 140 90
## [52] 130 130 90 40 50 100 90 120 90 90 110 100 80 80 90 110 100
## [69] 80 100 150 110 100 110 100 90 110
# 3. A different column, again with `$`
cereal$fat
## [1] 1 5 1 0 2 2 0 2 1 0 2 2 3 2 1 0 0 0 1 3 0 0 1 0 1 0 0 2 0 1 0 1 1 0 3
## [36] 2 1 0 1 1 1 2 1 1 3 3 2 1 1 2 0 2 1 0 0 0 1 2 1 2 0 0 0 0 0 0 1 0 0 1
## [71] 1 1 1 1 1 1 1
# 4. Rows 1-5 of several named columns at once
cereal[seq_len(5), c("name", "fat", "calories")]
## name fat calories
## 1 100%_Bran 1 60
## 2 100%_Natural_Bran 5 110
## 3 All-Bran 1 80
## 4 All-Bran_with_Extra_Fiber 0 50
## 5 Almond_Delight 2 110
# Basic scatter plot of calories against fat; the plot object is kept in
# plotbasic and then displayed.
plotbasic <- ggplot(data = cereal, mapping = aes(x = fat, y = calories)) +
  geom_point() +
  labs(
    title = "Calories vs Fat in cereals",
    x = "Grams of Fat",
    y = "Calories/serving"
  )
plotbasic

# Same scatter plot, but with geom_jitter() in place of geom_point() so that
# coincident points are spread apart by a little random noise.
plotbasic2 <- ggplot(data = cereal, mapping = aes(x = fat, y = calories)) +
  geom_jitter() +
  labs(
    title = "Calories vs Fat in cereals",
    x = "Grams of Fat",
    y = "Calories/serving"
  )
plotbasic2

# Same plot in base R graphics (ugh). Try to avoid using base R graphics.
# jitter() adds a small amount of random noise to each coordinate so that
# overlapping points are separated.
plot(
  jitter(cereal$fat), jitter(cereal$calories),
  main = "Plot of calories vs. grams of fat",
  xlab = "Grams of fat",
  ylab = "Calories/serving" # axis label previously misspelled "servince"
)

# Simple linear regression of calories on grams of fat, followed by the
# printed model summary.
fit.calories.fat <- lm(formula = calories ~ fat, data = cereal)
summary(fit.calories.fat)
##
## Call:
## lm(formula = calories ~ fat, data = cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.132 -5.132 4.868 14.868 45.256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.132 3.141 30.285 < 2e-16 ***
## fat 9.806 2.207 4.443 3.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.36 on 75 degrees of freedom
## Multiple R-squared: 0.2084, Adjusted R-squared: 0.1978
## F-statistic: 19.74 on 1 and 75 DF, p-value: 3.009e-05
# ANOVA table for the regression. Careful: anova() reports Type I SS.
anova(fit.calories.fat)
## Analysis of Variance Table
##
## Response: calories
## Df Sum Sq Mean Sq F value Pr(>F)
## fat 1 7402.9 7402.9 19.743 3.009e-05 ***
## Residuals 75 28121.8 375.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Estimated coefficients
coef(fit.calories.fat)
## (Intercept) fat
## 95.131579 9.806005
# Standard errors: square roots of the diagonal of the coefficient
# variance-covariance matrix
sqrt(diag(vcov(fit.calories.fat)))
## (Intercept) fat
## 3.141224 2.206897
# Confidence intervals on the parameters
confint(fit.calories.fat)
## 2.5 % 97.5 %
## (Intercept) 88.873939 101.38922
## fat 5.409642 14.20237
# Components stored in the summary object ...
names(summary(fit.calories.fat))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
# ... any of which can be extracted by name
summary(fit.calories.fat)[["r.squared"]]
## [1] 0.2083875
summary(fit.calories.fat)[["sigma"]]
## [1] 19.36381
# Class of the fitted object, and the methods available for that class
class(fit.calories.fat)
## [1] "lm"
methods(class = class(fit.calories.fat))
## [1] add1 alias anova case.names
## [5] coerce confint cooks.distance deviance
## [9] dfbeta dfbetas drop1 dummy.coef
## [13] effects emm_basis extractAIC family
## [17] formula fortify hatvalues influence
## [21] initialize kappa labels logLik
## [25] model.frame model.matrix nobs plot
## [29] predict print proj qr
## [33] recover_data residuals rstandard rstudent
## [37] show simulate slotsFromS3 summary
## [41] variable.names vcov
## see '?methods' for accessing help and source code
# Overlay the fitted regression line on the jittered scatter plot, keep the
# result in plotline, and display it.
plotline <- plotbasic2 +
  geom_abline(
    intercept = coef(fit.calories.fat)[["(Intercept)"]],
    slope = coef(fit.calories.fat)[["fat"]]
  )
plotline

# Alternative: skip the explicit lm() fit and let ggplot2 fit and draw the
# line itself.
plot.calories.fat <- ggplot(data = cereal, mapping = aes(x = fat, y = calories)) +
  geom_jitter(shape = 1) + # shape 1 = hollow circles
  geom_smooth(
    method = lm, # linear regression line
    se = FALSE # no shaded confidence region
  )
plot.calories.fat

# Make a nicer scatter plot and add the fitted line in base R graphics,
# written to a PNG file. Ugh. Base R graphics are not recommended.
png("cal-vs-fat3-base.png") # open the PNG device; drawing now goes to the file
plot(
  jitter(cereal$fat), jitter(cereal$calories),
  main = "Plot of calories vs. grams of fat",
  xlab = "Grams of fat",
  ylab = "Calories/serving" # axis label previously misspelled "servince"
)
abline(fit.calories.fat) # add the fitted regression line
dev.off() # close the device so the file is completed
## quartz_off_screen
## 2
# Simple single-factor ANOVA:
# is the mean amount of sugars the same for all shelves?
# The categorical variable must be a FACTOR, so shelfF is used rather than
# the integer column shelf.
fit.sugars.shelf <- lm(formula = sugars ~ shelfF, data = cereal)
anova(fit.sugars.shelf)
## Analysis of Variance Table
##
## Response: sugars
## Df Sum Sq Mean Sq F value Pr(>F)
## shelfF 2 220.23 110.117 6.6013 0.002316 **
## Residuals 73 1217.71 16.681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Estimated marginal means for each shelf with confidence limits, followed by
# a compact letter display using the Tukey multiple-comparison adjustment.
# NOTE(review): CLD() is believed to be deprecated in newer emmeans releases
# in favour of multcomp::cld() -- confirm against the installed version.
fit.sugars.shelf.lsmo <- emmeans::emmeans(fit.sugars.shelf, specs = ~shelfF)
fit.sugars.shelf.cld <- CLD(fit.sugars.shelf.lsmo, adjust = "tukey")
fit.sugars.shelf.cld
## shelfF emmean SE df lower.CL upper.CL .group
## 1 5.105263 0.9369889 73 2.815493 7.395034 1
## 3 6.527778 0.6807066 73 4.864298 8.191257 1
## 2 9.619048 0.8912542 73 7.441041 11.797054 2
##
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 3 estimates
## P value adjustment: tukey method for comparing a family of 3 estimates
## significance level used: alpha = 0.05
# Plot the compact letter display with sf.cld.plot.bar(), which is not defined
# in this file (presumably from the sourced schwarz.functions.r -- TODO confirm).
cld.plot <- sf.cld.plot.bar(fit.sugars.shelf.cld, "shelfF", order = FALSE)
cld.plot

# Pairwise differences between the shelf means
pairs(fit.sugars.shelf.lsmo)
## contrast estimate SE df t.ratio p.value
## 1 - 2 -4.513784 1.293168 73 -3.490 0.0023
## 1 - 3 -1.422515 1.158149 73 -1.228 0.4405
## 2 - 3 3.091270 1.121470 73 2.756 0.0199
##
## P value adjustment: tukey method for comparing a family of 3 estimates