Skip to content

Instantly share code, notes, and snippets.

@korkridake
Created November 30, 2018 11:24
Show Gist options
  • Save korkridake/3fd8bad35d25746bf86949b0a1c33b63 to your computer and use it in GitHub Desktop.
Save korkridake/3fd8bad35d25746bf86949b0a1c33b63 to your computer and use it in GitHub Desktop.
if (!require(car)){install.packages(car, dep=T)}
library(car)
###############
# #
# Exercise 1 #
# #
###############
# load data file
mussel<-read.csv(file.choose())
str(mussel)
## 'data.frame': 25 obs. of 3 variables:
## $ AREA : num 516 469 462 939 1357 ...
## $ SPECIES: int 3 7 6 8 10 9 10 11 16 9 ...
## $ INDIV : int 18 60 57 100 48 118 148 214 225 283 ...
head(mussel)
## AREA SPECIES INDIV
## 1 516.00 3 18
## 2 469.06 7 60
## 3 462.25 6 57
## 4 938.60 8 100
## 5 1357.15 10 48
## 6 1773.66 9 118
scatterplot(SPECIES ~ AREA, data = mussel)
# data are not normally distributed (especially AREA)
# The species richness peaked in the middle (not a good data)
###############
# #
# Exercise 2 #
# #
###############
#fit a linear model
mussel.lm <- lm(SPECIES ~ AREA, data = mussel)
summary(mussel.lm)
##
## Call:
## lm(formula = SPECIES ~ AREA, data = mussel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1964 -2.7521 -0.7509 1.2094 7.2148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.856e+00 1.044e+00 9.441 2.24e-09 ***
## AREA 6.593e-04 9.283e-05 7.102 3.10e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.759 on 23 degrees of freedom
## Multiple R-squared: 0.6868, Adjusted R-squared: 0.6732
## F-statistic: 50.44 on 1 and 23 DF, p-value: 3.102e-07
#plot the diagnostics of R inbuilt model validation plot defaults
op <- par(mfrow = c(2, 2))
plot(mussel.lm)
par(op) # turns graphics device back to default of one plot per page!
# FINAL VALIDATION TASK - residuals against explanatory variable
plot(mussel.lm$resid ~ mussel$AREA,
xlab = "Mussel bed Area",
ylab = "Residuals")
# indicates a few large values and a slight wedge due to numerous small patches
###############
# #
# Exercise 3 #
# #
###############
#fit a second order polynomial to the area variable
mussel.lm1 <- lm(SPECIES ~ AREA + I(AREA^2), data = mussel)
# I(AREA^2) fits the 2nd order polynomial
# results
summary(mussel.lm1)
##
## Call:
## lm(formula = SPECIES ~ AREA + I(AREA^2), data = mussel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7894 -1.0485 0.4301 1.3790 5.0666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.593e+00 1.156e+00 5.701 9.82e-06 ***
## AREA 1.745e-03 2.826e-04 6.174 3.26e-06 ***
## I(AREA^2) -4.118e-08 1.036e-08 -3.974 0.000643 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.933 on 22 degrees of freedom
## Multiple R-squared: 0.8177, Adjusted R-squared: 0.8011
## F-statistic: 49.34 on 2 and 22 DF, p-value: 7.393e-09
# validate the model
op <- par(mfrow = c(2, 2))
plot(mussel.lm1)
par(op)
# FINAL VALIDATION TASK - residuals against explanatory variable
plot(mussel.lm1$resid ~ mussel$AREA,
xlab = "Mussel bed Area",
ylab = "Residuals")
# Ntry to use a third order polynomial
mussel.lm2 <- lm(SPECIES ~ AREA + I(AREA^2) + I(AREA^3), data = mussel)
# I(AREA^2) fits the 2nd order polynomial
# I(AREA^3) fits the 3nd order polynomial
# check the results....
summary(mussel.lm2)
##
## Call:
## lm(formula = SPECIES ~ AREA + I(AREA^2) + I(AREA^3), data = mussel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0646 -0.7522 0.5169 1.5261 4.0841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.025e+00 1.338e+00 3.755 0.00117 **
## AREA 2.766e-03 5.750e-04 4.811 9.38e-05 ***
## I(AREA^2) -1.600e-07 6.016e-08 -2.660 0.01466 *
## I(AREA^3) 3.177e-12 1.587e-12 2.002 0.05843 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.751 on 21 degrees of freedom
## Multiple R-squared: 0.8469, Adjusted R-squared: 0.825
## F-statistic: 38.72 on 3 and 21 DF, p-value: 9.719e-09
# validate the model
op <- par(mfrow = c(2, 2))
plot(mussel.lm2)
par(op)
# FINAL VALIDATION TASK - residuals against explanatory variable
plot(mussel.lm2$resid ~ mussel$AREA,
xlab = "Mussel bed Area",
ylab = "Residuals")
# same result using the following code
mussel.lm3 <- lm(SPECIES ~ poly(AREA, 3), data = mussel)
# check the results....
summary(mussel.lm3)
##
## Call:
## lm(formula = SPECIES ~ poly(AREA, 3), data = mussel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0646 -0.7522 0.5169 1.5261 4.0841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.0000 0.5502 27.264 < 2e-16 ***
## poly(AREA, 3)1 26.7009 2.7509 9.706 3.26e-09 ***
## poly(AREA, 3)2 -11.6546 2.7509 -4.237 0.000369 ***
## poly(AREA, 3)3 5.5060 2.7509 2.002 0.058426 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.751 on 21 degrees of freedom
## Multiple R-squared: 0.8469, Adjusted R-squared: 0.825
## F-statistic: 38.72 on 3 and 21 DF, p-value: 9.719e-09
###############
# #
# Exercise 4 #
# #
###############
# We now have three models which one is the best?
# M0 is the poorest
# Distinct M1 and M2 using ANOVA and AIC to assess which model is most parimonious
anova(mussel.lm, mussel.lm1, mussel.lm2)
## Analysis of Variance Table
##
## Model 1: SPECIES ~ AREA
## Model 2: SPECIES ~ AREA + I(AREA^2)
## Model 3: SPECIES ~ AREA + I(AREA^2) + I(AREA^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 23 325.06
## 2 22 189.23 1 135.830 17.949 0.000369 ***
## 3 21 158.92 1 30.316 4.006 0.058426 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mussel.lm, mussel.lm1, mussel.lm2)
## df AIC
## mussel.lm 3 141.0756
## mussel.lm1 4 129.5497
## mussel.lm2 5 127.1848
# ANOVA and AIC indicate mussel.lm2 is the 'best' model
# Finalise the model and plot it....
# plot base graph (with CIs)
plot(SPECIES ~ AREA, data = mussel,
pch = 16,
xlab = (expression(paste("Mussel bed area ", "(", m^2, ")"))),
ylab = "Species Richness")
# predict across the data
x <- seq(min(mussel$AREA), max(mussel$AREA), l=1000) # 1000 steps....
y <- predict(mussel.lm2, data.frame(AREA = x), interval = "c")
matlines(x, y, lty = 1, col = 1)
# plot base graph (without CIs)
plot(SPECIES ~ AREA, data = mussel,
pch = 16,
xlab = (expression(paste("Mussel bed area ", "(", m^2, ")"))),
ylab = "Species Richness")
# predict across the data
x <- seq(min(mussel$AREA), max(mussel$AREA), l=1000) # 1000 steps....
points(x, predict(mussel.lm2, data.frame(AREA = x)), type = "l")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment