Skip to content

Instantly share code, notes, and snippets.

@nickpettican
Last active October 28, 2019 05:49
Show Gist options
  • Save nickpettican/cac4be02e8da391ad159 to your computer and use it in GitHub Desktop.
Save nickpettican/cac4be02e8da391ad159 to your computer and use it in GitHub Desktop.
# ANOVA: ANALYSIS OF VARIANCE
# Regression and ANOVA are identical approaches except for the nature of the explanatory variables
# STATISTICAL BACKGORUND
# The emphasis of ANOVA is on hypothesis testing
# To estimate means and standard errors of differences between means
# Comparing two means by t-test: (difference between two means)/standard error of the difference -> compare with critical value
# In ANOVA we compare 3 or more means
# e.g.
# SST (total variation overall mean value of y) = sum of the squares of the differences
# SSE (error sum of squares) = sum of squares o fthe differences between each y value and its own treatment mean
# if the two means were the same SSE = SST
# if they are different => treatment sum of squares => SSA = SST - SSE
# to analyse variance => convert the sums of squares into variances by dividing by their degrees of freedom
# we might have k levels of any factor hence k-1 degrees of freedom for SSA
# if each factor level were replicated n times there would be n-1 d.f. for error within each level
# so k(n-1) d.f. for error in the whole
# Treatment - SSA - k-1 d.f.
# Error - SSE - k(n-1) d.f.
# Total - SST - kn-1 d.f.
# each element in the sums of squares is divided by d.f. to give variances in the mean square
# significance is assessed using F test
# treatment variance MSA is divided by error variance s^2 -> converted to a p value using pf
# if p value < significance threshold (0.05) we REJECT the null hypothesis
# if p value > significance threshold (0.05) -> could have arisen by chance, we ACCEPT the null hypothesis
# ONE WAY ANOVA
oneway<-read.table("c:\\MSc\\Statistics\\Data\\oneway.txt",header=T)
attach(oneway)
head(oneway)
tapply(Growth,Photoperiod,mean)
# calculate the mean growth at each photoperiod
Photoperiod<-ordered(Photoperiod,levels=c("Very.short","Short","Long","Very.long"))
# declare photoperiod to be an ordered factor
tapply(Growth,Photoperiod,mean)
mean(Growth)
# the grand mean
SST<-sum((Growth-mean(Growth))^2)
SST
# to work out the total sum of squares
tapply(Growth,Photoperiod,length)
rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)
# to se the values of yi − y
SSA<-sum(rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)^2)
SSA
# now we have SSA
Dvs<-Growth[Photoperiod=="Very.short"]-mean(Growth[Photoperiod=="Very.short"])
Dvs
Ds<-Growth[Photoperiod=="Short"]-mean(Growth[Photoperiod=="Short"])
Ds
Dl<-Growth[Photoperiod=="Long"]-mean(Growth[Photoperiod=="Long"])
Dl
Dvl<-Growth[Photoperiod=="Very.long"]-mean(Growth[Photoperiod=="Very.long"])
Dvl
# these calculate the residuals: the differences between the observed growth and the average for that photoperiod
sum(Dvs^2)
sum(Ds^2)
sum(Dl^2)
sum(Dvl^2)
SSE<-3.333333+6.833333+9.5+12.83333
SSE
# sums of squares of these differences
SST-SSA
# a quicker way of working out SSE
# R has a quicker way of doing a one-way analysis of variance using "lm"
Response.variable~Explanatory.variable
# the explanatory variable is a factor
# to check if a variable is a factor we use
is.factor(Photoperiod)
one.way<-lm(Growth~Photoperiod)
anova(one-way)
# this carries out the ANOVA test once we know if the variable is a factor
summary(one.way)
# can also be used on "lm" models
# TWO-WAY ANOVA
rm(Photoperiod)
detach(oneway)
twoway<-read.table("c:\\MSc\\Statistics\\Data\\twoway.txt",header=T)
attach(twoway)
head(twoway)
tapply(Growth,Genotype,mean)
# compares the mean growth rates of 6 genotypes
tapply(Growth,Genotype,length)
# removes the variation attributable to Genotype, makes the error variance smaller and difference between Photoperiod means greater
SSB<-sum(rep(tapply(Growth,Genotype,mean)-mean(Growth),4)^2)
SSB
# now we can begin the two-way ANOVA
# THE EASY WAY
two.way<-lm(Growth~Genotype+Photoperiod)
anova(two.way)
# includes Genotype in the model
# the difference between the Photoperiod means is now highly significant
# ASSUMPTIONS OF ANOVA
# random sampling, equal variances, independence of errors, normal distribution of errors, additivity of treatment effects
# RANDOM SAMPLING
# sampling needs to be done randomly otherwise they will probably be biased and their interpretation is bound to be equivocal
# EQUAL VARIANCES
# ANOVA assumes that the sampling errors do not differ significantly from one treatment to another
# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE
factorial<-read.table("c:\\MSc\\Statistics\\Data\\factorial.txt",header=T)
attach(factorial)
factorial
# calculation of the interaction sum of squares
tapply(growth,diet,mean)
tapply(growth,diet,length)
# working out SSA, sum of squares for diet
as.vector(tapply(growth,diet,mean))-mean(growth)
# differences between the overall mean and the means on the different diets
4*sum((as.vector(tapply(growth,diet,mean))-mean(growth))^2)
# sum of squares of these differences for all 12 individuals
tapply(growth,coat,mean)
tapply(growth,coat,length)
as.vector(tapply(growth,coat,mean))-mean(growth)
6*sum((as.vector(tapply(growth,coat,mean))-mean(growth))^2)
# works out SSB, sum of squares for coat
tapply(growth,list(coat,diet),mean)
tapply(growth,list(coat,diet),length)
# work out the mean for each paired replicate with the same diet and coat colour and the number of replicates
tapply(growth,list(coat,diet),mean)-mean(growth)
# difference between these means and the overall mean growth
2*sum((as.vector(tapply(growth,list(coat,diet),mean))-mean(growth))^2)-2.613333-2.66
# subtract the sum of squares for the main effects (SSA and SSB) from the sum of squares of this bigger difference to get SSAB
# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE IN R
attach(factorial)
model<-lm(growth~diet*coat)
anova(model)
model2<-update(model , ~ . - diet:coat)
# model simplification to remove any non-significant parameters
# the new model (model2) gets an update of the earlier model
# ,~.- means to take the whole of "model" and remove from it the interaction diet:coat
anova(model,model2)
# model2 is not significantly different in its explanatory power than the more complicated model
anova(model2)
# hint of an effect of diet (p=0.0719)
model3<-update(model2, ~. -diet)
anova(model2,model3)
# gives the same p value
anova(model3)
# when diet is left out of the model the effect of coat is insignificant
tapply(growth,diet,mean)
# diets A and B produce a similar response, but diet C seems to have more effect
diet2<-factor(1+(diet=="C"))
# all diets are represented as 1 except diet C -> 2
model4<-update(model3, ~. +diet2)
anova(model3,model4)
# by reducing the levels of diet from 3 to 2 has turned a non-significant effect into a significant one
model5<-update(model4, ~. +diet2:coat)
anova(model4,model5)
# there is no interaction with the new simpler factor
# model 4 is the minimal adequate model
# THREE-WAY FACTORIAL ANOVA
Daphnia.data<-read.table("c:\\MSc\\Statistics\\Data\\Daphnia.txt",header=T)
attach(Daphnia.data)
# calculate SST
head(Daphnia.data)
SST<-sum((Growth.rate-mean(Growth.rate))^2)
SST
x<-tapply(Growth.rate, list(Water,Detergent,Daphnia),mean)
x
sum(3*(x-mean(Growth.rate))^2)
# subtract the grand mean from each of these, square them, add them up and multiply by 3
factorial<-lm(Growth.rate~Water*Detergent*Daphnia)
anova(factorial)
# compare our answers
# * is used to specify the fit of all interaction terms
tapply(Growth.rate,list(Detergent,Daphnia),mean)
# calculate a 2-dimensional table of mean growth rates for each combination of Detergent and Daphnia clone
interaction.plot(Detergent,Daphnia,Growth.rate)
# shows the interaction effects between Detergent and Clone
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment