sebnyberg · May 11, 2017 15:01
diff --git a/3.2.R b/3.2.R

 # ========================================
 # ==============[ Q 3.2.1 ]===============
 # ========================================
 load('hilda.Rdata')
 source('helpers.R')

 hilda <- setup_data_base(hilda)

 # form the first model by splitting the ages into groups of five
 hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95))

 # compare number of samples in the intervals
 hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10), 
     main="Number of people per age interval",
     xlab="Age interval")

 # save
 pdf(file="3_2_1-hist-10categories.pdf")
 hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10), 
     main="Number of people per age interval",
     xlab="Age interval")
 dev.off()

 # merge last two categories for a more even distribution of data 
 hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85))

 # compare number of samples in the intervals
 hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8), 
     main="Number of people per age interval",
     xlab="Age interval")

 # save 
 pdf(file="3_2_1-hist-8categories.pdf")
 hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
     main="Number of people per age interval",
     xlab="Age interval")
 dev.off()

 # rename the factors..
 hilda$age.interval <- factor(hilda$age.interval, 
                             labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+"))

 # make a list of failures
 hilda$no.hospdays <- 1 - hilda$hospdays

 # looks alright.. compare the values of the different proportions
 # let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables
 # which are x2 and gr. Then the summation function is applied to aggregated values.
 agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum)
 agg$total_events <- agg$hospdays + agg$no.hospdays

 # find median for our categories
 i = 1
 agg$xmid <- NA
 for (interval in levels(hilda$age.interval)) {
  agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval])
  i = i + 1
 }

 # calculate proportions
 agg$prop<-agg$hospdays/agg$total_events

 # plot the proportions
 plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
     type="l", xlab = "age", ylab = "probability of spending at least one hospital day")

 # save
 pdf("3_2_1-proportions.pdf")
 plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
     type="l", xlab = "age", ylab = "probability of spending at least one hospital day")


 # ========================================
 # ==============[ Q 3.2.2 ]===============
 # ========================================

 # create the continuous model
 model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial)

 # create a continuous model with a second order term
 age <- hilda$age
 age.sq <- hilda$age^2
 model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial)

 # in order to plot we make predictions for a sequence of x-values
 x1 <- seq(50,100,1)

 odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2))
 odds.cont  <- get_log_odds(model.age, cbind(x1))
 probs.trans <- odds.trans / (1 + odds.trans)
 probs.cont <- odds.cont / (1 + odds.cont)

 # plot proportions vs continous without transformation
 plot(x1, probs.cont,
     type="l", 
     ylim = c(0.05,0.40), 
     xlim = c(50,100), 
     xlab = "age", 
     ylab = "probability of spending a hospital day", 
     main="Proportions of categorical age\n vs probabilities of continuous age")
 lines(agg$xmid, agg$prop, lty=2, col="blue")
 points(agg$xmid, agg$prop, pch=8, col="blue")

 # save to file
 pdf(file="3_2_2-no-transformation.pdf")
 plot(x1, probs.cont,
     type="l", 
     ylim = c(0.05,0.40), 
     xlim = c(50,100), 
     xlab = "age", 
     ylab = "probability of spending a hospital day", 
     main="Proportions of categorical age\n vs probabilities of continuous age")
 lines(agg$xmid, agg$prop, lty=2, col="blue")
 points(agg$xmid, agg$prop, pch=8, col="blue")
 dev.off()

 # plot proportions vs our continuous logistic regression
 plot(x1, probs.trans, 
     type="l", 
     ylim = c(0.05,0.30), 
     xlim = c(50,100), 
     xlab = "age", 
     ylab = "probability of spending a hospital day", 
     main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
 lines(agg$xmid, agg$prop, lty=2, col="blue")
 points(agg$xmid, agg$prop, pch=8, col="blue")

 # save to file
 pdf(file="3_2_2-with-transformation.pdf")
 plot(x1, probs.trans, 
     type="l", 
     ylim = c(0.05,0.30), 
     xlim = c(50,100), 
     xlab = "age", 
     ylab = "probability of spending a hospital day", 
     main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
 lines(agg$xmid, agg$prop, lty=2, col="blue")
 points(agg$xmid, agg$prop, pch=8, col="blue")
 dev.off()

 model.current <- model.age.trans


 # ========================================
 # ==============[ Q 3.2.3 ]===============
 # ========================================

 # Hack... get the se.fit from the predict function and find someone who is 70 years old..

 index.70yo <- min(which(hilda$age == 70))
 xbhat <- predict(model.age.trans, se.fit = T)
 se.70yo <- xbhat$se.fit[index.70yo]

 summary(model.age.trans)

 lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2))

 probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo))

 odds_lo <- lnodds.70yo + 1.96 * se.70yo
 odds_hi <- lnodds.70yo - 1.96 * se.70yo

 ci_odds_lo <- exp(odds_lo)
 ci_odds_lo
 ci_odds_hi <- exp(odds_hi)
 ci_odds_hi

 ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo)
 ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi)


 # ========================================
 # ==============[ Q 3.2.4 ]===============
 # ========================================

 # First we calculate the W matrix 
 yhat <- predict(model.current, type="response")
 W <- diag(yhat * (1 - yhat))
 X <- cbind(1, age, age.sq)
 cov_matrix <- solve(t(X) %*% W %*% X)

 # Did we get the correct result?
 summary(model.current)$cov.unscaled
 cov_matrix

 # Indeedidoodely we did! Amazing

	# ========================================
	# ==============[ Q 3.2.1 ]===============
	# ========================================
	load('hilda.Rdata')
	source('helpers.R')

	hilda <- setup_data_base(hilda)

	# form the first model by splitting the ages into groups of five
	hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95))

	# compare number of samples in the intervals
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
	main="Number of people per age interval",
	xlab="Age interval")

	# save
	pdf(file="3_2_1-hist-10categories.pdf")
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
	main="Number of people per age interval",
	xlab="Age interval")
	dev.off()

	# merge last two categories for a more even distribution of data
	hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85))

	# compare number of samples in the intervals
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
	main="Number of people per age interval",
	xlab="Age interval")

	# save
	pdf(file="3_2_1-hist-8categories.pdf")
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
	main="Number of people per age interval",
	xlab="Age interval")
	dev.off()

	# rename the factors..
	hilda$age.interval <- factor(hilda$age.interval,
	labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+"))

	# make a list of failures
	hilda$no.hospdays <- 1 - hilda$hospdays

	# looks alright.. compare the values of the different proportions
	# let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables
	# which are x2 and gr. Then the summation function is applied to aggregated values.
	agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum)
	agg$total_events <- agg$hospdays + agg$no.hospdays

	# find median for our categories
	i = 1
	agg$xmid <- NA
	for (interval in levels(hilda$age.interval)) {
	agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval])
	i = i + 1
	}

	# calculate proportions
	agg$prop<-agg$hospdays/agg$total_events

	# plot the proportions
	plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
	type="l", xlab = "age", ylab = "probability of spending at least one hospital day")

	# save
	pdf("3_2_1-proportions.pdf")
	plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
	type="l", xlab = "age", ylab = "probability of spending at least one hospital day")


	# ========================================
	# ==============[ Q 3.2.2 ]===============
	# ========================================

	# create the continuous model
	model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial)

	# create a continuous model with a second order term
	age <- hilda$age
	age.sq <- hilda$age^2
	model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial)

	# in order to plot we make predictions for a sequence of x-values
	x1 <- seq(50,100,1)

	odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2))
	odds.cont <- get_log_odds(model.age, cbind(x1))
	probs.trans <- odds.trans / (1 + odds.trans)
	probs.cont <- odds.cont / (1 + odds.cont)

	# plot proportions vs continous without transformation
	plot(x1, probs.cont,
	type="l",
	ylim = c(0.05,0.40),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\n vs probabilities of continuous age")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")

	# save to file
	pdf(file="3_2_2-no-transformation.pdf")
	plot(x1, probs.cont,
	type="l",
	ylim = c(0.05,0.40),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\n vs probabilities of continuous age")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")
	dev.off()

	# plot proportions vs our continuous logistic regression
	plot(x1, probs.trans,
	type="l",
	ylim = c(0.05,0.30),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")

	# save to file
	pdf(file="3_2_2-with-transformation.pdf")
	plot(x1, probs.trans,
	type="l",
	ylim = c(0.05,0.30),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")
	dev.off()

	model.current <- model.age.trans


	# ========================================
	# ==============[ Q 3.2.3 ]===============
	# ========================================

	# Hack... get the se.fit from the predict function and find someone who is 70 years old..

	index.70yo <- min(which(hilda$age == 70))
	xbhat <- predict(model.age.trans, se.fit = T)
	se.70yo <- xbhat$se.fit[index.70yo]

	summary(model.age.trans)

	lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2))

	probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo))

	odds_lo <- lnodds.70yo + 1.96 * se.70yo
	odds_hi <- lnodds.70yo - 1.96 * se.70yo

	ci_odds_lo <- exp(odds_lo)
	ci_odds_lo
	ci_odds_hi <- exp(odds_hi)
	ci_odds_hi

	ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo)
	ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi)


	# ========================================
	# ==============[ Q 3.2.4 ]===============
	# ========================================

	# First we calculate the W matrix
	yhat <- predict(model.current, type="response")
	W <- diag(yhat * (1 - yhat))
	X <- cbind(1, age, age.sq)
	cov_matrix <- solve(t(X) %% W %% X)

	# Did we get the correct result?
	summary(model.current)$cov.unscaled
	cov_matrix

	# Indeedidoodely we did! Amazing