nickpettican · October 28, 2019 05:49
diff --git a/stats05.R b/stats05.R
 # ANOVA: ANALYSIS OF VARIANCE

 # Regression and ANOVA are identical approaches except for the nature of the explanatory variables

 # STATISTICAL BACKGORUND

 # The emphasis of ANOVA is on hypothesis testing
 # To estimate means and standard errors of differences between means
 # Comparing two means by t-test: (difference between two means)/standard error of the difference -> compare with critical value
 # In ANOVA we compare 3 or more means

 # e.g.

 # SST (total variation overall mean value of y) = sum of the squares of the differences
 # SSE (error sum of squares) = sum of squares o fthe differences between each y value and its own treatment mean
 # if the two means were the same SSE = SST
 # if they are different => treatment sum of squares => SSA = SST - SSE
 # to analyse variance => convert the sums of squares into variances by dividing by their degrees of freedom
 # we might have k levels of any factor hence k-1 degrees of freedom for SSA
 # if each factor level were replicated n times there would be n-1 d.f. for error within each level
 # so k(n-1) d.f. for error in the whole
 # Treatment - SSA - k-1 d.f.
 # Error - SSE - k(n-1) d.f.
 # Total - SST - kn-1 d.f.

 # each element in the sums of squares is divided by d.f. to give variances in the mean square
 # significance is assessed using F test 
 # treatment variance MSA is divided by error variance s^2 -> converted to a p value using pf
 # if p value < significance threshold (0.05) we REJECT the null hypothesis
 # if p value > significance threshold (0.05) -> could have arisen by chance, we ACCEPT the null hypothesis

 # ONE WAY ANOVA

 oneway<-read.table("c:\\MSc\\Statistics\\Data\\oneway.txt",header=T)
 attach(oneway)
 head(oneway)

 tapply(Growth,Photoperiod,mean)
 # calculate the mean growth at each photoperiod

 Photoperiod<-ordered(Photoperiod,levels=c("Very.short","Short","Long","Very.long")) 
 # declare photoperiod to be an ordered factor

 tapply(Growth,Photoperiod,mean) 

 mean(Growth)
 # the grand mean

 SST<-sum((Growth-mean(Growth))^2)
 SST
 # to work out the total sum of squares

 tapply(Growth,Photoperiod,length)
 rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)
 # to se the values of yi − y

 SSA<-sum(rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)^2)
 SSA
 # now we have SSA

 Dvs<-Growth[Photoperiod=="Very.short"]-mean(Growth[Photoperiod=="Very.short"])
 Dvs
 Ds<-Growth[Photoperiod=="Short"]-mean(Growth[Photoperiod=="Short"])
 Ds
 Dl<-Growth[Photoperiod=="Long"]-mean(Growth[Photoperiod=="Long"])
 Dl
 Dvl<-Growth[Photoperiod=="Very.long"]-mean(Growth[Photoperiod=="Very.long"])
 Dvl
 # these calculate the residuals: the differences between the observed growth and the average for that photoperiod

 sum(Dvs^2)
 sum(Ds^2)
 sum(Dl^2)
 sum(Dvl^2)
 SSE<-3.333333+6.833333+9.5+12.83333
 SSE
 # sums of squares of these differences

 SST-SSA
 # a quicker way of working out SSE

 # R has a quicker way of doing a one-way analysis of variance using "lm" 
 Response.variable~Explanatory.variable
 # the explanatory variable is a factor

 # to check if a variable is a factor we use
 is.factor(Photoperiod)

 one.way<-lm(Growth~Photoperiod) 
 anova(one-way)
 # this carries out the ANOVA test once we know if the variable is a factor

 summary(one.way)
 # can also be used on "lm" models

 # TWO-WAY ANOVA

 rm(Photoperiod)
 detach(oneway) 

 twoway<-read.table("c:\\MSc\\Statistics\\Data\\twoway.txt",header=T)
 attach(twoway)
 head(twoway)

 tapply(Growth,Genotype,mean) 
 # compares the mean growth rates of 6 genotypes

 tapply(Growth,Genotype,length)
 # removes the variation attributable to Genotype, makes the error variance smaller and difference between Photoperiod means greater

 SSB<-sum(rep(tapply(Growth,Genotype,mean)-mean(Growth),4)^2)
 SSB
 # now we can begin the two-way ANOVA

 # THE EASY WAY

 two.way<-lm(Growth~Genotype+Photoperiod) 
 anova(two.way) 
 # includes Genotype in the model
 # the difference between the Photoperiod means is now highly significant

 # ASSUMPTIONS OF ANOVA 
 # random sampling, equal variances, independence of errors, normal distribution of errors, additivity of treatment effects

 # RANDOM SAMPLING
 # sampling needs to be done randomly otherwise they will probably be biased and their interpretation is bound to be equivocal

 # EQUAL VARIANCES
 # ANOVA assumes that the sampling errors do not differ significantly from one treatment to another

 # TWO-WAY FACTORIAL ANALYSIS OF VARIANCE

 factorial<-read.table("c:\\MSc\\Statistics\\Data\\factorial.txt",header=T)
 attach(factorial)
 factorial 
 # calculation of the interaction sum of squares

 tapply(growth,diet,mean)
 tapply(growth,diet,length)
 # working out SSA, sum of squares for diet

 as.vector(tapply(growth,diet,mean))-mean(growth)
 # differences between the overall mean and the means on the different diets

 4*sum((as.vector(tapply(growth,diet,mean))-mean(growth))^2)
 # sum of squares of these differences for all 12 individuals

 tapply(growth,coat,mean)
 tapply(growth,coat,length)
 as.vector(tapply(growth,coat,mean))-mean(growth)
 6*sum((as.vector(tapply(growth,coat,mean))-mean(growth))^2)
 # works out SSB, sum of squares for coat

 tapply(growth,list(coat,diet),mean)
 tapply(growth,list(coat,diet),length)
 # work out the mean for each paired replicate with the same diet and coat colour and the number of replicates

 tapply(growth,list(coat,diet),mean)-mean(growth)
 # difference between these means and the overall mean growth

 2*sum((as.vector(tapply(growth,list(coat,diet),mean))-mean(growth))^2)-2.613333-2.66
 # subtract the sum of squares for the main effects (SSA and SSB) from the sum of squares of this bigger difference to get SSAB

 # TWO-WAY FACTORIAL ANALYSIS OF VARIANCE IN R

 attach(factorial)
 model<-lm(growth~diet*coat)
 anova(model) 

 model2<-update(model , ~ . - diet:coat) 
 # model simplification to remove any non-significant parameters
 # the new model (model2) gets an update of the earlier model
 # ,~.- means to take the whole of "model" and remove from it the interaction diet:coat

 anova(model,model2) 
 # model2 is not significantly different in its explanatory power than the more complicated model

 anova(model2)
 # hint of an effect of diet (p=0.0719)

 model3<-update(model2, ~. -diet)
 anova(model2,model3) 
 # gives the same p value

 anova(model3) 
 # when diet is left out of the model the effect of coat is insignificant

 tapply(growth,diet,mean) 
 # diets A and B produce a similar response, but diet C seems to have more effect

 diet2<-factor(1+(diet=="C"))
 # all diets are represented as 1 except diet C -> 2
 model4<-update(model3, ~. +diet2) 
 anova(model3,model4) 
 # by reducing the levels of diet from 3 to 2 has turned a non-significant effect into a significant one

 model5<-update(model4, ~. +diet2:coat)
 anova(model4,model5) 
 # there is no interaction with the new simpler factor
 # model 4 is the minimal adequate model

 # THREE-WAY FACTORIAL ANOVA

 Daphnia.data<-read.table("c:\\MSc\\Statistics\\Data\\Daphnia.txt",header=T) 
 attach(Daphnia.data) 

 # calculate SST

 head(Daphnia.data) 
 SST<-sum((Growth.rate-mean(Growth.rate))^2)
 SST
 x<-tapply(Growth.rate, list(Water,Detergent,Daphnia),mean)
 x
 sum(3*(x-mean(Growth.rate))^2)
 # subtract the grand mean from each of these, square them, add them up and multiply by 3

 factorial<-lm(Growth.rate~Water*Detergent*Daphnia)
 anova(factorial) 
 # compare our answers
 # * is used to specify the fit of all interaction terms

 tapply(Growth.rate,list(Detergent,Daphnia),mean) 
 # calculate a 2-dimensional table of mean growth rates for each combination of Detergent and Daphnia clone

 interaction.plot(Detergent,Daphnia,Growth.rate) 
 # shows the interaction effects between Detergent and Clone
	# ANOVA: ANALYSIS OF VARIANCE

	# Regression and ANOVA are identical approaches except for the nature of the explanatory variables

	# STATISTICAL BACKGORUND

	# The emphasis of ANOVA is on hypothesis testing
	# To estimate means and standard errors of differences between means
	# Comparing two means by t-test: (difference between two means)/standard error of the difference -> compare with critical value
	# In ANOVA we compare 3 or more means

	# e.g.

	# SST (total variation overall mean value of y) = sum of the squares of the differences
	# SSE (error sum of squares) = sum of squares o fthe differences between each y value and its own treatment mean
	# if the two means were the same SSE = SST
	# if they are different => treatment sum of squares => SSA = SST - SSE
	# to analyse variance => convert the sums of squares into variances by dividing by their degrees of freedom
	# we might have k levels of any factor hence k-1 degrees of freedom for SSA
	# if each factor level were replicated n times there would be n-1 d.f. for error within each level
	# so k(n-1) d.f. for error in the whole
	# Treatment - SSA - k-1 d.f.
	# Error - SSE - k(n-1) d.f.
	# Total - SST - kn-1 d.f.

	# each element in the sums of squares is divided by d.f. to give variances in the mean square
	# significance is assessed using F test
	# treatment variance MSA is divided by error variance s^2 -> converted to a p value using pf
	# if p value < significance threshold (0.05) we REJECT the null hypothesis
	# if p value > significance threshold (0.05) -> could have arisen by chance, we ACCEPT the null hypothesis

	# ONE WAY ANOVA

	oneway<-read.table("c:\\MSc\\Statistics\\Data\\oneway.txt",header=T)
	attach(oneway)
	head(oneway)

	tapply(Growth,Photoperiod,mean)
	# calculate the mean growth at each photoperiod

	Photoperiod<-ordered(Photoperiod,levels=c("Very.short","Short","Long","Very.long"))
	# declare photoperiod to be an ordered factor

	tapply(Growth,Photoperiod,mean)

	mean(Growth)
	# the grand mean

	SST<-sum((Growth-mean(Growth))^2)
	SST
	# to work out the total sum of squares

	tapply(Growth,Photoperiod,length)
	rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)
	# to se the values of yi − y

	SSA<-sum(rep(tapply(Growth,Photoperiod,mean)-mean(Growth),6)^2)
	SSA
	# now we have SSA

	Dvs<-Growth[Photoperiod=="Very.short"]-mean(Growth[Photoperiod=="Very.short"])
	Dvs
	Ds<-Growth[Photoperiod=="Short"]-mean(Growth[Photoperiod=="Short"])
	Ds
	Dl<-Growth[Photoperiod=="Long"]-mean(Growth[Photoperiod=="Long"])
	Dl
	Dvl<-Growth[Photoperiod=="Very.long"]-mean(Growth[Photoperiod=="Very.long"])
	Dvl
	# these calculate the residuals: the differences between the observed growth and the average for that photoperiod

	sum(Dvs^2)
	sum(Ds^2)
	sum(Dl^2)
	sum(Dvl^2)
	SSE<-3.333333+6.833333+9.5+12.83333
	SSE
	# sums of squares of these differences

	SST-SSA
	# a quicker way of working out SSE

	# R has a quicker way of doing a one-way analysis of variance using "lm"
	Response.variable~Explanatory.variable
	# the explanatory variable is a factor

	# to check if a variable is a factor we use
	is.factor(Photoperiod)

	one.way<-lm(Growth~Photoperiod)
	anova(one-way)
	# this carries out the ANOVA test once we know if the variable is a factor

	summary(one.way)
	# can also be used on "lm" models

	# TWO-WAY ANOVA

	rm(Photoperiod)
	detach(oneway)

	twoway<-read.table("c:\\MSc\\Statistics\\Data\\twoway.txt",header=T)
	attach(twoway)
	head(twoway)

	tapply(Growth,Genotype,mean)
	# compares the mean growth rates of 6 genotypes

	tapply(Growth,Genotype,length)
	# removes the variation attributable to Genotype, makes the error variance smaller and difference between Photoperiod means greater

	SSB<-sum(rep(tapply(Growth,Genotype,mean)-mean(Growth),4)^2)
	SSB
	# now we can begin the two-way ANOVA

	# THE EASY WAY

	two.way<-lm(Growth~Genotype+Photoperiod)
	anova(two.way)
	# includes Genotype in the model
	# the difference between the Photoperiod means is now highly significant

	# ASSUMPTIONS OF ANOVA
	# random sampling, equal variances, independence of errors, normal distribution of errors, additivity of treatment effects

	# RANDOM SAMPLING
	# sampling needs to be done randomly otherwise they will probably be biased and their interpretation is bound to be equivocal

	# EQUAL VARIANCES
	# ANOVA assumes that the sampling errors do not differ significantly from one treatment to another

	# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE

	factorial<-read.table("c:\\MSc\\Statistics\\Data\\factorial.txt",header=T)
	attach(factorial)
	factorial
	# calculation of the interaction sum of squares

	tapply(growth,diet,mean)
	tapply(growth,diet,length)
	# working out SSA, sum of squares for diet

	as.vector(tapply(growth,diet,mean))-mean(growth)
	# differences between the overall mean and the means on the different diets

	4*sum((as.vector(tapply(growth,diet,mean))-mean(growth))^2)
	# sum of squares of these differences for all 12 individuals

	tapply(growth,coat,mean)
	tapply(growth,coat,length)
	as.vector(tapply(growth,coat,mean))-mean(growth)
	6*sum((as.vector(tapply(growth,coat,mean))-mean(growth))^2)
	# works out SSB, sum of squares for coat

	tapply(growth,list(coat,diet),mean)
	tapply(growth,list(coat,diet),length)
	# work out the mean for each paired replicate with the same diet and coat colour and the number of replicates

	tapply(growth,list(coat,diet),mean)-mean(growth)
	# difference between these means and the overall mean growth

	2*sum((as.vector(tapply(growth,list(coat,diet),mean))-mean(growth))^2)-2.613333-2.66
	# subtract the sum of squares for the main effects (SSA and SSB) from the sum of squares of this bigger difference to get SSAB

	# TWO-WAY FACTORIAL ANALYSIS OF VARIANCE IN R

	attach(factorial)
	model<-lm(growth~diet*coat)
	anova(model)

	model2<-update(model , ~ . - diet:coat)
	# model simplification to remove any non-significant parameters
	# the new model (model2) gets an update of the earlier model
	# ,~.- means to take the whole of "model" and remove from it the interaction diet:coat

	anova(model,model2)
	# model2 is not significantly different in its explanatory power than the more complicated model

	anova(model2)
	# hint of an effect of diet (p=0.0719)

	model3<-update(model2, ~. -diet)
	anova(model2,model3)
	# gives the same p value

	anova(model3)
	# when diet is left out of the model the effect of coat is insignificant

	tapply(growth,diet,mean)
	# diets A and B produce a similar response, but diet C seems to have more effect

	diet2<-factor(1+(diet=="C"))
	# all diets are represented as 1 except diet C -> 2
	model4<-update(model3, ~. +diet2)
	anova(model3,model4)
	# by reducing the levels of diet from 3 to 2 has turned a non-significant effect into a significant one

	model5<-update(model4, ~. +diet2:coat)
	anova(model4,model5)
	# there is no interaction with the new simpler factor
	# model 4 is the minimal adequate model

	# THREE-WAY FACTORIAL ANOVA

	Daphnia.data<-read.table("c:\\MSc\\Statistics\\Data\\Daphnia.txt",header=T)
	attach(Daphnia.data)

	# calculate SST

	head(Daphnia.data)
	SST<-sum((Growth.rate-mean(Growth.rate))^2)
	SST
	x<-tapply(Growth.rate, list(Water,Detergent,Daphnia),mean)
	x
	sum(3*(x-mean(Growth.rate))^2)
	# subtract the grand mean from each of these, square them, add them up and multiply by 3

	factorial<-lm(Growth.rate~WaterDetergentDaphnia)
	anova(factorial)
	# compare our answers
	# * is used to specify the fit of all interaction terms

	tapply(Growth.rate,list(Detergent,Daphnia),mean)
	# calculate a 2-dimensional table of mean growth rates for each combination of Detergent and Daphnia clone

	interaction.plot(Detergent,Daphnia,Growth.rate)
	# shows the interaction effects between Detergent and Clone