### Agenda: Installing and using packages Multiple Testing with Fisher LSD Multiple Testing with Tukey HSD Randomization Test for ANOVA #### Not included... next time? At all? ###ANOVA without the equal variance assumption with oneway.test ######### INSTALLING AND USING PACKAGES ## See installation instructions in the notes for pictures ## Packages --> Install Package(s) --> Choose Mirror (Canada BC) --> Choose Packages (agricolae) library(agricolae) # add "agricolae" to the library ## This is essentially the same thing you do when loading a pre-made dataset data(InsectSprays) # build-in, not part of agricolae package ######### MULTIPLE TESTING WITH FISHER LSD ## (Page 99-100) ## First, we'll need a data set. This is from Exercise 3.27. ## It's the concentration of a chemical after beign exposed to one of four catalysts. ## Note that not every sample is the same size. conc = c(58.2,57.2,58.4,55.8,54.9,56.3,54.5,57.0,55.3,50.1,54.2,55.4,52.9,49.9,50.0,51.7) catl = c(1,1,1,1,1,2,2,2,2,3,3,3,4,4,4,4) catl = as.factor(catl) data327 = data.frame(catl,conc) ## The command for Fisher's LSD (Least Significant Difference) is part of the agricolae package ## So make sure agricolae is installed and loaded first. ### You need to put a model into the LSD.tst() function. But either aov or lm works. mod1 = aov(conc ~ catl, data=data327) LSD.test(mod1,"catl") ### lm() will give you the same results as aov() mod2 = lm(conc ~ catl, data=data327) LSD.test(mod2,"catl") ### This does a t-test on each possible pair. ### It needs an AoV or linear model to get the MSE, which it uses as the variance ### estimate for every pair instead of getting the pooled variance for each one. ### This way the variance is pooled among ALL groups and you get N - a degrees of freedom. ### One issue is that it's doing 4 choose 2, or 6 distinct t-tests. ### If each one has a false positive rate of alpha = 0.05, then the chance ### of at least one of the t-tests giving a false positive is much higher than 0.05. ### (How much higher depends on the assumption of the t-tests being independent) ### However, we can use a multiple comparison adjustment to reduce the chances of ### or there being a false positive to close to that of alpha. ### The simplest way is the Bonferroni method. LSD.test(mod1,"catl",p.adj="bonferroni") ## Only two distinct groups are found now, instead of three. ## The Bonferroni method reduces the p-value of each test by a factor of the number of tests. ## There are six tests, so the p-value is reduced to 0.05 / 6 here. ## Under the assumpting of independent tests, the expected number of false positives is alpha. ## There are other methods, but I suggest you read up on them before using them ## in case you need to justify your decisions later. LSD.test(mod1,"catl",p.adj="fdr") #False discovery rate LSD.test(mod1,"catl",p.adj="holm") # Holm LSD.test(mod1,"catl",p.adj="BH") # Bonferroni-Holm ########################## Multiple Testing with Tukey HSD #(Page 98) ## HSD stands for Honest Significant Difference. ## It's similar to the LSD, but has a built it correction for multiple testing ## that doesn't require an adjustment to the p-value. HSD.test(mod1,"catl") ## It's set up and interpreted the same way as the LSD method. It doesn't require a ## package, but the agricolae package does offer its own version of it. ######################### Randomization Test for ANOVA ### Here we use code from the randomization test in lab 2 and expand it to account for more than 2 groups. ######################### ## ANOVA Randomization test p.77-78 ## First, we need the means every group. ## One fast way to do that is with the by() command ## by (response variable, grouping variable, function) real.means = by(data327$conc, data327$catl, mean) #1. Get the SS.total #2. Get the SS.treatment ## For each of many runs... #### 3. Randomly assign the treatments to the responses #### 4. Get the SS.treatment for each response # (end for loop) # 5. The proportion of randomizations that have a bigger SS.treatment # Extra. Get the F-Stat from each ss.treatment compare to f-dist #1. Get the SS.total from (3.8) on page 74 y = data327$conc trt = data327$catl N = length(y) ## N is the total number of observations a = length(unique(trt)) ## a is the total number of different treatments y.. = sum(y) #Real.SS.Total = sum(y^2) - (y..^2)/N Real.SS.Total = sum( (y - mean(y))^2 ) #2. Get the SS.treatment from (3.9) on page 74 ## Note that formulae will have to be modified because of the unequal sample sizes yi. = by(y, trt, sum) ni = by(y, trt, length) yi._bar = yi. / ni y.._bar = mean(y) #Real.SS.Treatment = sum(yi.^2/ni) - (y..^2)/N Real.SS.Treatment = sum( ni*(yi._bar - y.._bar)^2) Real.SS.Error = Real.SS.Total - Real.SS.Treatment ## For each of many runs... rnd.SS.Treatment = rep(NA,10000) for(run in 1:10000) { #### 3. Randomly assign the treatments to the responses random.trt = sample(trt) #### 4. Get the SS.treatment for each response yi. = by(y, random.trt, sum) yi._bar = yi. / ni #rnd.SS.Treatment[run] = sum(yi.^2/ni) - (y..^2)/N rnd.SS.Treatment[run] = sum( ni*(yi._bar - y.._bar)^2) } # (end for loop) # 5. The proportion of randomizations that have a bigger SS.treatment ### Make a list of the runs that produced a larger SStreatment than the observed one random.beat.real = which(rnd.SS.Treatment > Real.SS.Treatment) ### Find how many there are out of 10000 p = length(random.beat.real) / 10000 # Extra. Get the F-Stat from each ss.treatment compare to f-dist rnd.SS.Error = Real.SS.Total - rnd.SS.Treatment rnd.MS.Treatment = rnd.SS.Treatment / (a - 1) #using the a from above rnd.MS.Error = rnd.SS.Error / (N - a) # using N and a from above rnd.F = rnd.MS.Treatment / rnd.MS.Error ### How does our random F compare to the real thing? hist(rnd.F, freq=FALSE, xlim=c(0,6),n=80) curve(df(x, df1=(a-1), df2=(N-a)),from=0,to=6,add=TRUE,lwd=2) ### Evidence that in a one-factor ANOVA, F is is strictly increasing over SS.Treatment plot(rnd.F ~ rnd.SS.Treatment)