sebnyberg · May 11, 2017 15:02
diff --git a/3.1.R b/3.1.R
 # ========================================
 # ==============[ Q 3.1.1 ]===============
 # ========================================

 # load data
 load('hilda.Rdata')
 source('helpers.R')

 # redifine the hospitaldays to be 1 for anything above 1
 hilda$hospdays[(hilda$hospdays >= 1)] = 1
 hilda$hospdays

 # redefine all answers which were "other" or "not answered" to be NA
 hilda$health[(hilda$health >= 4)] = NA
 hilda$health <- factor(hilda$health, labels=c("good", "bad", "inbetween"))
 hilda$health

 # check that all covariates are sigificantly different from zero
 model.no.ref <- glm(hilda$hospdays ~ hilda$health - 1, family="binomial")
 #                                      ^^^^^^ this makes categories non-relative to the reference
 summary(model.no.ref)

 # they are significantly different from zero
 # lets try a model with healthgood as a reference category
 model.ref.good <- glm(hilda$hospdays ~ hilda$health, family="binomial")
 summary(model.ref.good)

 # lets check if healthbad is significantly different from health inbetween
 model.ref.bad <- glm(hilda$hospdays ~ factor(hilda$health, levels=c("bad","inbetween","good")), family="binomial")
 summary(model.ref.bad)

 # inbetween is not significantly different from bad (the reference category)... 

 # we should consider using a model which merges the categories. 
 # Lets check the ANOVA for the merged vs split model.

 # create merged health
 healthmerged <- hilda$health
 levels(healthmerged) <- c("good", "not good", "not good")

 # compare models
 model.original <- glm(hilda$hospdays ~ hilda$health, family=binomial)
 model.merged <- glm(hilda$hospdays ~ healthmerged, family=binomial)

 # anova will use a likelihood ratio test to compare the models
 anova(model.merged, model.original)
 qchisq(1-0.05,1)

 # since 3.444 < 3.841459 we can not reject the null hypothesis (that the two models are the same)
 # thus, merging the models does not result in a loss or improvement either way
 model.health <- model.merged

 # merge the groups, since the Pr(>|z|) was high (0.6)
 hilda$health <- healthmerged


 # ========================================
 # ==============[ Q 3.1.2 ]===============
 # ========================================


 # since the model coefficients are w.r.t. the reference category (good)
 # the odds ratio for people with not good as compared to good is
 ln.or.notgood <- model.health$coefficients[1]
 or.notgood <- exp(ln.or.notgood)

 # the S.E. is the second diagonal element in the covariance matrix
 # of our model
 se <- sqrt(diag(summary(model.health)$cov.unscaled))
 se.or.notgood <- se[2]

 # calculate bounds
 lo <- ln.or.notgood - 1.96 * se.or.notgood
 hi <- ln.or.notgood + 1.96 * se.or.notgood

 # calculate confidence interval for 
 ci.or.notgood.lo <- exp(lo)
 ci.or.notgood.hi <- exp(hi)


 # ========================================
 # ==============[ Q 3.1.3 ]===============
 # ========================================

 beta0 <- model.health$coefficients[1]
 beta1 <- model.health$coefficients[2]

 ln.odds.notgood <- beta0 + beta1
 odds.notgood <- exp(ln.odds.notgood)

 se.notgood <- sqrt(se[2]^2 - se[1]^2)

 # calculate bounds
 lo.notgood <- ln.odds.notgood - 1.96 * se.notgood
 hi.notgood <- ln.odds.notgood + 1.96 * se.notgood

 # calculate confidence interval for odds of not good
 ci.notgood.lo <- exp(lo.notgood)
 ci.notgood.hi <- exp(hi.notgood)



 # ========================================
 # ==============[ Q 3.1.4 ]===============
 # ========================================

 # probability of having at least one hospital day for people with good health:

 odds.good <- exp(beta0)
 prob.good <- odds.good / (1 + odds.good)

 se.good <- se[1]

 lo.good <- beta0 - 1.96 * se.good
 hi.good <- beta0 + 1.96 * se.good

 ci.prob.good.lo <- exp(lo.good) / (1 + exp(lo.good))
 ci.prob.good.hi <- exp(hi.good) / (1 + exp(hi.good))

 # for not good health

 prob.notgood <- odds.notgood / (1 + odds.notgood)

 ci.prob.notgood.lo <- exp(lo.notgood) / (1 + exp(lo.notgood))
 ci.prob.notgood.hi <- exp(hi.notgood) / (1 + exp(hi.notgood))
	# ========================================
	# ==============[ Q 3.1.1 ]===============
	# ========================================

	# load data
	load('hilda.Rdata')
	source('helpers.R')

	# redifine the hospitaldays to be 1 for anything above 1
	hilda$hospdays[(hilda$hospdays >= 1)] = 1
	hilda$hospdays

	# redefine all answers which were "other" or "not answered" to be NA
	hilda$health[(hilda$health >= 4)] = NA
	hilda$health <- factor(hilda$health, labels=c("good", "bad", "inbetween"))
	hilda$health

	# check that all covariates are sigificantly different from zero
	model.no.ref <- glm(hilda$hospdays ~ hilda$health - 1, family="binomial")
	# ^^^^^^ this makes categories non-relative to the reference
	summary(model.no.ref)

	# they are significantly different from zero
	# lets try a model with healthgood as a reference category
	model.ref.good <- glm(hilda$hospdays ~ hilda$health, family="binomial")
	summary(model.ref.good)

	# lets check if healthbad is significantly different from health inbetween
	model.ref.bad <- glm(hilda$hospdays ~ factor(hilda$health, levels=c("bad","inbetween","good")), family="binomial")
	summary(model.ref.bad)

	# inbetween is not significantly different from bad (the reference category)...

	# we should consider using a model which merges the categories.
	# Lets check the ANOVA for the merged vs split model.

	# create merged health
	healthmerged <- hilda$health
	levels(healthmerged) <- c("good", "not good", "not good")

	# compare models
	model.original <- glm(hilda$hospdays ~ hilda$health, family=binomial)
	model.merged <- glm(hilda$hospdays ~ healthmerged, family=binomial)

	# anova will use a likelihood ratio test to compare the models
	anova(model.merged, model.original)
	qchisq(1-0.05,1)

	# since 3.444 < 3.841459 we can not reject the null hypothesis (that the two models are the same)
	# thus, merging the models does not result in a loss or improvement either way
	model.health <- model.merged

	# merge the groups, since the Pr(>\|z\|) was high (0.6)
	hilda$health <- healthmerged


	# ========================================
	# ==============[ Q 3.1.2 ]===============
	# ========================================


	# since the model coefficients are w.r.t. the reference category (good)
	# the odds ratio for people with not good as compared to good is
	ln.or.notgood <- model.health$coefficients[1]
	or.notgood <- exp(ln.or.notgood)

	# the S.E. is the second diagonal element in the covariance matrix
	# of our model
	se <- sqrt(diag(summary(model.health)$cov.unscaled))
	se.or.notgood <- se[2]

	# calculate bounds
	lo <- ln.or.notgood - 1.96 * se.or.notgood
	hi <- ln.or.notgood + 1.96 * se.or.notgood

	# calculate confidence interval for
	ci.or.notgood.lo <- exp(lo)
	ci.or.notgood.hi <- exp(hi)


	# ========================================
	# ==============[ Q 3.1.3 ]===============
	# ========================================

	beta0 <- model.health$coefficients[1]
	beta1 <- model.health$coefficients[2]

	ln.odds.notgood <- beta0 + beta1
	odds.notgood <- exp(ln.odds.notgood)

	se.notgood <- sqrt(se[2]^2 - se[1]^2)

	# calculate bounds
	lo.notgood <- ln.odds.notgood - 1.96 * se.notgood
	hi.notgood <- ln.odds.notgood + 1.96 * se.notgood

	# calculate confidence interval for odds of not good
	ci.notgood.lo <- exp(lo.notgood)
	ci.notgood.hi <- exp(hi.notgood)



	# ========================================
	# ==============[ Q 3.1.4 ]===============
	# ========================================

	# probability of having at least one hospital day for people with good health:

	odds.good <- exp(beta0)
	prob.good <- odds.good / (1 + odds.good)

	se.good <- se[1]

	lo.good <- beta0 - 1.96 * se.good
	hi.good <- beta0 + 1.96 * se.good

	ci.prob.good.lo <- exp(lo.good) / (1 + exp(lo.good))
	ci.prob.good.hi <- exp(hi.good) / (1 + exp(hi.good))

	# for not good health

	prob.notgood <- odds.notgood / (1 + odds.notgood)

	ci.prob.notgood.lo <- exp(lo.notgood) / (1 + exp(lo.notgood))
	ci.prob.notgood.hi <- exp(hi.notgood) / (1 + exp(hi.notgood))