Created
August 16, 2013 12:23
-
-
Save rpietro/6249436 to your computer and use it in GitHub Desktop.
regression scripts from Dalgaard's book
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# regression models from dalgaard | |
library("ISwR") | |
?thuesen | |
thuesen | |
attach(thuesen) | |
lm(short.velocity~blood.glucose) | |
summary(lm(short.velocity~blood.glucose)) #the median should not be far from zero, and the minimum and maximum should be roughly equal in absolute value | |
plot(blood.glucose,short.velocity) | |
abline(lm(short.velocity~blood.glucose)) | |
lm.velo <- lm(short.velocity~blood.glucose) | |
fitted(lm.velo) | |
resid(lm.velo) | |
plot(blood.glucose,short.velocity) | |
lines(blood.glucose,fitted(lm.velo)) | |
lines(blood.glucose[!is.na(short.velocity)],fitted(lm.velo)) | |
cc <- complete.cases(thuesen) | |
options(na.action=na.exclude) | |
lm.velo <- lm(short.velocity~blood.glucose) | |
fitted(lm.velo) | |
?segments | |
segments(blood.glucose, fitted(lm.velo), blood.glucose, short.velocity) | |
plot(fitted(lm.velo),resid(lm.velo)) | |
qqnorm(resid(lm.velo)) | |
predict(lm.velo) | |
?predict | |
predict(lm.velo, int="confidence") | |
#definition confidence interval: if confidence intervals are constructed across many separate data analyses of repeated (and possibly different) experiments, the proportion of such intervals that contain the true value of the parameter will match the confidence level; this is guaranteed by the reasoning underlying the construction of confidence intervals | |
pred.frame <- data.frame(blood.glucose=1:24) | |
head(pred.frame) | |
head(lm.velo) | |
pp <- predict(lm.velo, int="p", newdata=pred.frame) | |
pc <- predict(lm.velo, int="c", newdata=pred.frame) | |
plot(blood.glucose,short.velocity, ylim=range(short.velocity, pp, na.rm=T)) | |
pred.gluc <- pred.frame$blood.glucose | |
?matlines | |
matlines(pred.gluc, pc, lty=c(2,3,3), col="red") | |
matlines(pred.gluc, pp, lty=c(1,3,3), col="blue") | |
# multiple regression | |
?par | |
par(mex=0.5) #‘mex’ is a character size expansion factor which is used to | |
# describe coordinates in the margins of plots. Note that this | |
# does not change the font size, rather specifies the size of | |
# font (as a multiple of ‘csi’) used to convert between ‘mar’ | |
# and ‘mai’, and between ‘oma’ and ‘omi’. | |
pairs(cystfibr, gap=0, cex.labels=0.7) #gap: distance between subplots, in margin lines. cex.labels, font.labels: graphics parameters for the text panel. | |
attach(cystfibr) | |
lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc) | |
summary(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)) | |
anova(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)) # these tests are successive; they correspond to (reading upward from the bottom) a stepwise removal of terms from the model until finally only age is left. | |
m1<-lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc) | |
m2<-lm(pemax~age) | |
anova(m1,m2) | |
# > 955.4+155.0+632.3+2862.2+1549.1+561.9+194.6+92.4 | |
# [1] 7002.9 | |
# > 7002.9/8 | |
# [1] 875.3625 | |
# > 875.36/648.7 | |
# [1] 1.349407 | |
# > 1-pf(1.349407,8,15) | |
# [1] 0.2935148 | |
# polynomial regression | |
attach(cystfibr) | |
summary(lm(pemax~height+I(height^2))) | |
pred.frame <- data.frame(height=seq(110,180,2)) | |
lm.pemax.hq <- lm(pemax~height+I(height^2)) | |
predict(lm.pemax.hq,interval="pred",newdata=pred.frame) | |
pp <- predict(lm.pemax.hq,newdata=pred.frame,interval="pred") | |
pc <- predict(lm.pemax.hq,newdata=pred.frame,interval="conf") | |
plot(height,pemax,ylim=c(0,200)) | |
matlines(pred.frame$height,pp,lty=c(1,2,2),col="black") | |
matlines(pred.frame$height,pc,lty=c(1,3,3),col="black") | |
# regression through the origin | |
?runif | |
x <- runif(20) | |
x | |
?rnorm | |
y <- 2*x+rnorm(20,0,0.3) | |
y | |
summary(lm(y~x)) | |
summary(lm(y~x-1)) | |
anova(lm(y~x)) | |
# design matrices | |
# If you add the columns together, weighted by the corresponding regression coefficients, you get the fitted values. the intercept enters as the coefficient to a column of ones. | |
model.matrix(pemax~height+weight) | |
attach(red.cell.folate) | |
model.matrix(folate~ventilation) #for a model with factors then you have dummy variables | |
# diagnostics | |
attach(thuesen) | |
options(na.action="na.exclude") | |
lm.velo <- lm(short.velocity~blood.glucose) | |
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3) | |
plot(lm.velo, which=1:4) # The third plot is of the square root of the absolute value of the standardized residuals; this reduces the skewness of the distribution and makes it much easier to detect if there might be a trend in the dispersion. The fourth plot is of “Cook’s distance”, which is a measure of the influence of each observation on the regression coefficients | |
par(opar) | |
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3) | |
plot(rstandard(lm.velo)) # standardized residuals since residuals corresponding to extreme x-values generally have a lower SD due to overfitting | |
plot(rstudent(lm.velo)) | |
plot(dffits(lm.velo),type="l") #how much an observation affects the as- sociated fitted value | |
matplot(dfbetas(lm.velo),type="l", col="red") #change in the estimated parameters if an observation is excluded relative to its standard error. | |
lines(sqrt(cooks.distance(lm.velo)), lwd=2) | |
par(opar) | |
summary(lm(short.velocity~blood.glucose, subset=-13)) | |
# logistic regression | |
no.yes <- c("No","Yes") | |
?gl | |
smoking <- gl(2,1,8,no.yes) | |
obesity <- gl(2,2,8,no.yes) | |
snoring <- gl(2,4,8,no.yes) | |
n.tot <- c(60,17,8,2,187,85,51,23) | |
n.hyp <- c(5,2,1,0,35,13,15,8) | |
data.frame(smoking,obesity,snoring,n.tot,n.hyp) | |
expand.grid(smoking=no.yes, obesity=no.yes, snoring=no.yes) | |
hyp.tbl <- cbind(n.hyp,n.tot-n.hyp) | |
hyp.tbl | |
glm(hyp.tbl ~ smoking + obesity + snoring, family=binomial("logit")) | |
glm(hyp.tbl ~ smoking + obesity + snoring, binomial) | |
prop.hyp <- n.hyp/n.tot | |
glm.hyp <- glm(prop.hyp~smoking+obesity+snoring,binomial,weights=n.tot) # The other way to specify a logistic regression model is to give the proportion of diseased in each cell. It is necessary to give weights because R cannot see how many observations a proportion is based on. | |
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial) | |
summary(glm.hyp) | |
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial) | |
summary(glm.hyp) | |
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial) | |
anova(glm.hyp, test="Chisq") | |
glm.hyp <- glm(hyp.tbl~snoring+obesity+smoking,binomial) | |
anova(glm.hyp, test="Chisq") | |
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial) | |
anova(glm.hyp, test="Chisq") | |
drop1(glm.hyp, test="Chisq") | |
exp(cbind(OR=coef(glm.hyp), confint(glm.hyp))) | |
library("ISwR") | |
juul$menarche <- factor(juul$menarche, labels=c("No","Yes")) | |
juul$tanner <- factor(juul$tanner) | |
juul.girl <- subset(juul,age>8 & age<20 & complete.cases(menarche)) | |
attach(juul.girl) | |
summary(glm(menarche ~ age,binomial)) | |
summary(glm(menarche ~ age+tanner,binomial)) | |
drop1(glm(menarche ~ age+tanner,binomial),test="Chisq") | |
predict(glm.hyp) | |
predict(glm.hyp, type="response") #obtain probabilities | |
plot(age, fitted(glm(menarche~age,binomial))) | |
glm.menarche <- glm(menarche~age, binomial) | |
Age <- seq(8,20,.1) | |
newages <- data.frame(age=Age) | |
newages | |
predicted.probability <- predict(glm.menarche, newages, type="resp") | |
plot(predicted.probability ~ Age, type="l") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment