Last active
October 28, 2019 05:49
-
-
Save nickpettican/cac4be02e8da391ad159 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ANOVA: ANALYSIS OF VARIANCE | |
# Regression and ANOVA are identical approaches except for the nature of the explanatory variables | |
# STATISTICAL BACKGORUND | |
# The emphasis of ANOVA is on hypothesis testing | |
# To estimate means and standard errors of differences between means | |
# Comparing two means by t-test: (difference between two means)/standard error of the difference -> compare with critical value | |
# In ANOVA we compare 3 or more means | |
# e.g. | |
# SST (total variation overall mean value of y) = sum of the squares of the differences | |
# SSE (error sum of squares) = sum of squares o fthe differences between each y value and its own treatment mean | |
# if the two means were the same SSE = SST | |
# if they are different => treatment sum of squares => SSA = SST - SSE | |
# to analyse variance => convert the sums of squares into variances by dividing by their degrees of freedom | |
# we might have k levels of any factor hence k-1 degrees of freedom for SSA | |
# if each factor level were replicated n times there would be n-1 d.f. for error within each level | |
# so k(n-1) d.f. for error in the whole | |
# Treatment - SSA - k-1 d.f. | |
# Error - SSE - k(n-1) d.f. | |
# Total - SST - kn-1 d.f. | |
# each element in the sums of squares is divided by d.f. to give variances in the mean square | |
# significance is assessed using F test | |
# treatment variance MSA is divided by error variance s^2 -> converted to a p value using pf | |
# if p value < significance threshold (0.05) we REJECT the null hypothesis | |
# if p value > significance threshold (0.05) -> could have arisen by chance, we ACCEPT the null hypothesis | |
# ONE WAY ANOVA | |
oneway<-read.table("c:\\MSc\\Statistics\\Data\\oneway.txt",header=T) | |
attach(oneway) | |
head(oneway) | |
tapply(Growth,Photoperiod,mean) | |
# calculate the mean growth at each photoperiod | |
Photoperiod<-ordered(Photoperiod,levels=c("Very.short","Short","Long","Very.long")) | |
# declare photoperiod to be an ordered factor | |
tapply(Growth,Photoperiod,mean) | |
mean(Growth) | |
# the grand mean | |
SST<-sum((Growth-mean(Growth))^2) | |
SST | |
# to work out the total sum of squares | |
tapply(Growth,Photoperiod,length) | |
rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6) | |
# to se the values of yi − y | |
SSA<-sum(rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)^2) | |
SSA | |
# now we have SSA | |
Dvs<-Growth[Photoperiod=="Very.short"]-mean(Growth[Photoperiod=="Very.short"]) | |
Dvs | |
Ds<-Growth[Photoperiod=="Short"]-mean(Growth[Photoperiod=="Short"]) | |
Ds | |
Dl<-Growth[Photoperiod=="Long"]-mean(Growth[Photoperiod=="Long"]) | |
Dl | |
Dvl<-Growth[Photoperiod=="Very.long"]-mean(Growth[Photoperiod=="Very.long"]) | |
Dvl | |
# these calculate the residuals: the differences between the observed growth and the average for that photoperiod | |
sum(Dvs^2) | |
sum(Ds^2) | |
sum(Dl^2) | |
sum(Dvl^2) | |
SSE<-3.333333+6.833333+9.5+12.83333 | |
SSE | |
# sums of squares of these differences | |
SST-SSA | |
# a quicker way of working out SSE | |
# R has a quicker way of doing a one-way analysis of variance using "lm" | |
Response.variable~Explanatory.variable | |
# the explanatory variable is a factor | |
# to check if a variable is a factor we use | |
is.factor(Photoperiod) | |
one.way<-lm(Growth~Photoperiod) | |
anova(one-way) | |
# this carries out the ANOVA test once we know if the variable is a factor | |
summary(one.way) | |
# can also be used on "lm" models | |
# TWO-WAY ANOVA | |
rm(Photoperiod) | |
detach(oneway) | |
twoway<-read.table("c:\\MSc\\Statistics\\Data\\twoway.txt",header=T) | |
attach(twoway) | |
head(twoway) | |
tapply(Growth,Genotype,mean) | |
# compares the mean growth rates of 6 genotypes | |
tapply(Growth,Genotype,length) | |
# removes the variation attributable to Genotype, makes the error variance smaller and difference between Photoperiod means greater | |
SSB<-sum(rep(tapply(Growth,Genotype,mean)-mean(Growth),4)^2) | |
SSB | |
# now we can begin the two-way ANOVA | |
# THE EASY WAY | |
two.way<-lm(Growth~Genotype+Photoperiod) | |
anova(two.way) | |
# includes Genotype in the model | |
# the difference between the Photoperiod means is now highly significant | |
# ASSUMPTIONS OF ANOVA | |
# random sampling, equal variances, independence of errors, normal distribution of errors, additivity of treatment effects | |
# RANDOM SAMPLING | |
# sampling needs to be done randomly otherwise they will probably be biased and their interpretation is bound to be equivocal | |
# EQUAL VARIANCES | |
# ANOVA assumes that the sampling errors do not differ significantly from one treatment to another | |
# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE | |
factorial<-read.table("c:\\MSc\\Statistics\\Data\\factorial.txt",header=T) | |
attach(factorial) | |
factorial | |
# calculation of the interaction sum of squares | |
tapply(growth,diet,mean) | |
tapply(growth,diet,length) | |
# working out SSA, sum of squares for diet | |
as.vector(tapply(growth,diet,mean))-mean(growth) | |
# differences between the overall mean and the means on the different diets | |
4*sum((as.vector(tapply(growth,diet,mean))-mean(growth))^2) | |
# sum of squares of these differences for all 12 individuals | |
tapply(growth,coat,mean) | |
tapply(growth,coat,length) | |
as.vector(tapply(growth,coat,mean))-mean(growth) | |
6*sum((as.vector(tapply(growth,coat,mean))-mean(growth))^2) | |
# works out SSB, sum of squares for coat | |
tapply(growth,list(coat,diet),mean) | |
tapply(growth,list(coat,diet),length) | |
# work out the mean for each paired replicate with the same diet and coat colour and the number of replicates | |
tapply(growth,list(coat,diet),mean)-mean(growth) | |
# difference between these means and the overall mean growth | |
2*sum((as.vector(tapply(growth,list(coat,diet),mean))-mean(growth))^2)-2.613333-2.66 | |
# subtract the sum of squares for the main effects (SSA and SSB) from the sum of squares of this bigger difference to get SSAB | |
# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE IN R | |
attach(factorial) | |
model<-lm(growth~diet*coat) | |
anova(model) | |
model2<-update(model , ~ . - diet:coat) | |
# model simplification to remove any non-significant parameters | |
# the new model (model2) gets an update of the earlier model | |
# ,~.- means to take the whole of "model" and remove from it the interaction diet:coat | |
anova(model,model2) | |
# model2 is not significantly different in its explanatory power than the more complicated model | |
anova(model2) | |
# hint of an effect of diet (p=0.0719) | |
model3<-update(model2, ~. -diet) | |
anova(model2,model3) | |
# gives the same p value | |
anova(model3) | |
# when diet is left out of the model the effect of coat is insignificant | |
tapply(growth,diet,mean) | |
# diets A and B produce a similar response, but diet C seems to have more effect | |
diet2<-factor(1+(diet=="C")) | |
# all diets are represented as 1 except diet C -> 2 | |
model4<-update(model3, ~. +diet2) | |
anova(model3,model4) | |
# by reducing the levels of diet from 3 to 2 has turned a non-significant effect into a significant one | |
model5<-update(model4, ~. +diet2:coat) | |
anova(model4,model5) | |
# there is no interaction with the new simpler factor | |
# model 4 is the minimal adequate model | |
# THREE-WAY FACTORIAL ANOVA | |
Daphnia.data<-read.table("c:\\MSc\\Statistics\\Data\\Daphnia.txt",header=T) | |
attach(Daphnia.data) | |
# calculate SST | |
head(Daphnia.data) | |
SST<-sum((Growth.rate-mean(Growth.rate))^2) | |
SST | |
x<-tapply(Growth.rate, list(Water,Detergent,Daphnia),mean) | |
x | |
sum(3*(x-mean(Growth.rate))^2) | |
# subtract the grand mean from each of these, square them, add them up and multiply by 3 | |
factorial<-lm(Growth.rate~Water*Detergent*Daphnia) | |
anova(factorial) | |
# compare our answers | |
# * is used to specify the fit of all interaction terms | |
tapply(Growth.rate,list(Detergent,Daphnia),mean) | |
# calculate a 2-dimensional table of mean growth rates for each combination of Detergent and Daphnia clone | |
interaction.plot(Detergent,Daphnia,Growth.rate) | |
# shows the interaction effects between Detergent and Clone |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment