Created
May 11, 2017 15:01
-
-
Save sebnyberg/0b4d83c97334227f350587a7dc2d7b61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ======================================== | |
# ==============[ Q 3.2.1 ]=============== | |
# ======================================== | |
load('hilda.Rdata') | |
source('helpers.R') | |
hilda <- setup_data_base(hilda) | |
# form the first model by splitting the ages into groups of five | |
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95)) | |
# compare number of samples in the intervals | |
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10), | |
main="Number of people per age interval", | |
xlab="Age interval") | |
# save | |
pdf(file="3_2_1-hist-10categories.pdf") | |
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10), | |
main="Number of people per age interval", | |
xlab="Age interval") | |
dev.off() | |
# merge last two categories for a more even distribution of data | |
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85)) | |
# compare number of samples in the intervals | |
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8), | |
main="Number of people per age interval", | |
xlab="Age interval") | |
# save | |
pdf(file="3_2_1-hist-8categories.pdf") | |
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8), | |
main="Number of people per age interval", | |
xlab="Age interval") | |
dev.off() | |
# rename the factors.. | |
hilda$age.interval <- factor(hilda$age.interval, | |
labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+")) | |
# make a list of failures | |
hilda$no.hospdays <- 1 - hilda$hospdays | |
# looks alright.. compare the values of the different proportions | |
# let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables | |
# which are x2 and gr. Then the summation function is applied to aggregated values. | |
agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum) | |
agg$total_events <- agg$hospdays + agg$no.hospdays | |
# find median for our categories | |
i = 1 | |
agg$xmid <- NA | |
for (interval in levels(hilda$age.interval)) { | |
agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval]) | |
i = i + 1 | |
} | |
# calculate proportions | |
agg$prop<-agg$hospdays/agg$total_events | |
# plot the proportions | |
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8), | |
type="l", xlab = "age", ylab = "probability of spending at least one hospital day") | |
# save | |
pdf("3_2_1-proportions.pdf") | |
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8), | |
type="l", xlab = "age", ylab = "probability of spending at least one hospital day") | |
# ======================================== | |
# ==============[ Q 3.2.2 ]=============== | |
# ======================================== | |
# create the continuous model | |
model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial) | |
# create a continuous model with a second order term | |
age <- hilda$age | |
age.sq <- hilda$age^2 | |
model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial) | |
# in order to plot we make predictions for a sequence of x-values | |
x1 <- seq(50,100,1) | |
odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2)) | |
odds.cont <- get_log_odds(model.age, cbind(x1)) | |
probs.trans <- odds.trans / (1 + odds.trans) | |
probs.cont <- odds.cont / (1 + odds.cont) | |
# plot proportions vs continous without transformation | |
plot(x1, probs.cont, | |
type="l", | |
ylim = c(0.05,0.40), | |
xlim = c(50,100), | |
xlab = "age", | |
ylab = "probability of spending a hospital day", | |
main="Proportions of categorical age\n vs probabilities of continuous age") | |
lines(agg$xmid, agg$prop, lty=2, col="blue") | |
points(agg$xmid, agg$prop, pch=8, col="blue") | |
# save to file | |
pdf(file="3_2_2-no-transformation.pdf") | |
plot(x1, probs.cont, | |
type="l", | |
ylim = c(0.05,0.40), | |
xlim = c(50,100), | |
xlab = "age", | |
ylab = "probability of spending a hospital day", | |
main="Proportions of categorical age\n vs probabilities of continuous age") | |
lines(agg$xmid, agg$prop, lty=2, col="blue") | |
points(agg$xmid, agg$prop, pch=8, col="blue") | |
dev.off() | |
# plot proportions vs our continuous logistic regression | |
plot(x1, probs.trans, | |
type="l", | |
ylim = c(0.05,0.30), | |
xlim = c(50,100), | |
xlab = "age", | |
ylab = "probability of spending a hospital day", | |
main="Proportions of categorical age\nvs probabilities of continuous age (transformed)") | |
lines(agg$xmid, agg$prop, lty=2, col="blue") | |
points(agg$xmid, agg$prop, pch=8, col="blue") | |
# save to file | |
pdf(file="3_2_2-with-transformation.pdf") | |
plot(x1, probs.trans, | |
type="l", | |
ylim = c(0.05,0.30), | |
xlim = c(50,100), | |
xlab = "age", | |
ylab = "probability of spending a hospital day", | |
main="Proportions of categorical age\nvs probabilities of continuous age (transformed)") | |
lines(agg$xmid, agg$prop, lty=2, col="blue") | |
points(agg$xmid, agg$prop, pch=8, col="blue") | |
dev.off() | |
model.current <- model.age.trans | |
# ======================================== | |
# ==============[ Q 3.2.3 ]=============== | |
# ======================================== | |
# Hack... get the se.fit from the predict function and find someone who is 70 years old.. | |
index.70yo <- min(which(hilda$age == 70)) | |
xbhat <- predict(model.age.trans, se.fit = T) | |
se.70yo <- xbhat$se.fit[index.70yo] | |
summary(model.age.trans) | |
lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2)) | |
probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo)) | |
odds_lo <- lnodds.70yo + 1.96 * se.70yo | |
odds_hi <- lnodds.70yo - 1.96 * se.70yo | |
ci_odds_lo <- exp(odds_lo) | |
ci_odds_lo | |
ci_odds_hi <- exp(odds_hi) | |
ci_odds_hi | |
ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo) | |
ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi) | |
# ======================================== | |
# ==============[ Q 3.2.4 ]=============== | |
# ======================================== | |
# First we calculate the W matrix | |
yhat <- predict(model.current, type="response") | |
W <- diag(yhat * (1 - yhat)) | |
X <- cbind(1, age, age.sq) | |
cov_matrix <- solve(t(X) %*% W %*% X) | |
# Did we get the correct result? | |
summary(model.current)$cov.unscaled | |
cov_matrix | |
# Indeedidoodely we did! Amazing |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment