sebnyberg · May 12, 2017 13:06
diff --git a/3.4.R b/3.4.R
 library(ggplot2)
 source('helpers.R')



 # ======================================
 # ==============[ Q 3.4 ]===============
 # ======================================

 # prepare the model by defining factors
 load('hilda.Rdata')
 hilda <- setup_data_base(hilda)
 hilda <- setup_data(hilda)


 model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial)
 model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb + 
                    hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown + 
                    hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh + 
                    hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial)

 step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1)


 # exercise seems like a good category to introduce, but many did not answer
 # or were not asked.. we need to inspect a subset of the entire dataset
 model.new <- update(model.current, . ~ . + hilda$exercise)
 compare_models(model.current, model.new)
 NROW(model.new$fitted.values)


 # about half the subjects were not asked about whether or not they exercise.
 # we can however use those who did answer to check if there is any interesting
 # relationship between exercise and hospital days

 # ==============[ EXERCISE ]===============


 load('hilda.Rdata')
 hilda2 <- setup_data_base(hilda)

 # exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
 # not asked has 4 entries, dont know has 0, no answer has 26
 hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA
 hilda2$exercise <- factor(hilda2$exercise, 
                          labels = c("practically none", "sometimes" ,"once a week", 
                                     "twice a week", "at least twice"),
                          levels = c(1,2,3,4,5))


 hilda2 <- na.omit(hilda2)

 # plot histogram, looks alright
 hist(as.numeric(hilda$exercise))

 # lets look at the probabilities for the categories..
 agg <- get_aggregate(hilda2, hilda2$exercise)
 agg
 plot(agg$variable, agg$props)

 levels(hilda2$exercise)
 hilda2$exercise

 model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial)
 summary(model.exercise)

 age.sq2 <- hilda2$age^2
 levels(hilda2$health) <- c("good", "not good", "not good")

 model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial)
 model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial)
 model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial)

 # not significant
 model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.health, model.new)

 # significant
 model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.age, model.new)

 # significant
 model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.sex, model.new)

 # what if we merge the groups to exercise yes/no?
 levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1))
 agg <- get_aggregate(hilda2, hilda2$exercise)
 agg
 plot(agg$variable, agg$props)

 # lets compare again

 # significant (just barely)
 model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.health, model.new)

 # significant
 model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.age, model.new)

 # significant
 model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
 compare_models(model.sex, model.new)

 # what does this tell us? it does seem that exercise is correlated with health
 # which makes sense.. 

 # we need to relevel exercise so that its yes / no against good / not good health
 cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health))

 # sanity check, this should be much lower... 
 cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex))



 # ==============[ Household income ]===============

 load('hilda.Rdata')
 hilda <- setup_data_base(hilda)


 hist(hilda$inc_hh[hilda$inc_hh <= 5000])

 # form the first model by splitting the ages into groups of five
 hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000))
 hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F)

 # first group is extremely small (141)... merge with second lowest
 NROW(hilda$inc_hh[hilda$inc_hh.interval == 0])
 hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000))
 hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F)

 # looks evenly distributed.. lets rename the categories

 # rename the factors..
 hilda$inc_hh.interval <- factor(hilda$inc_hh.interval, 
                                labels=c("1-999", "1000-1499", "1500-1999", "2000+"))

 agg <- get_aggregate(hilda, hilda$inc_hh.interval)
 agg
 plot(agg$variable, agg$props)


 # although this data is continuous, it looks like it makes sense to split it into 
 # two halves, one for 1-1499 and one for 1500+, lets test this hypothesis

 model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
 summary(model.income)

 # 1000-1499 is not significantly different from 0-1000..

 # lets compare category 3 and 4
 model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial)
 summary(model.income)
 # as expected, 2000+ is not significantly different from 1500-1999

 # lets merge and make the model (DO NOT RUN THIS TWICE)
 if (NROW(levels(hilda$inc_hh.interval)) > 2){
  levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high")
 }
 model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
 summary(model.income)

 # now lets compare the model against other models such as age, sex, exercise

 # first we merge categories as per usual
 levels(hilda$health) <- c("good", "not good", "not good")

 model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial)
 model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial)
 model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial)

 # significant
 model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial)
 compare_models(model.health, model.new)

 # not significant (?)
 model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial)
 compare_models(model.age, model.new)

 # significant
 model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial)
 compare_models(model.sex, model.new)

 # lets compare against a more complex model.

 model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial)
 model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial)
 compare_models(model.prev, model.new)
	library(ggplot2)
	source('helpers.R')



	# ======================================
	# ==============[ Q 3.4 ]===============
	# ======================================

	# prepare the model by defining factors
	load('hilda.Rdata')
	hilda <- setup_data_base(hilda)
	hilda <- setup_data(hilda)


	model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial)
	model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb +
	hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown +
	hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh +
	hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial)

	step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1)


	# exercise seems like a good category to introduce, but many did not answer
	# or were not asked.. we need to inspect a subset of the entire dataset
	model.new <- update(model.current, . ~ . + hilda$exercise)
	compare_models(model.current, model.new)
	NROW(model.new$fitted.values)


	# about half the subjects were not asked about whether or not they exercise.
	# we can however use those who did answer to check if there is any interesting
	# relationship between exercise and hospital days

	# ==============[ EXERCISE ]===============


	load('hilda.Rdata')
	hilda2 <- setup_data_base(hilda)

	# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
	# not asked has 4 entries, dont know has 0, no answer has 26
	hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA
	hilda2$exercise <- factor(hilda2$exercise,
	labels = c("practically none", "sometimes" ,"once a week",
	"twice a week", "at least twice"),
	levels = c(1,2,3,4,5))


	hilda2 <- na.omit(hilda2)

	# plot histogram, looks alright
	hist(as.numeric(hilda$exercise))

	# lets look at the probabilities for the categories..
	agg <- get_aggregate(hilda2, hilda2$exercise)
	agg
	plot(agg$variable, agg$props)

	levels(hilda2$exercise)
	hilda2$exercise

	model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial)
	summary(model.exercise)

	age.sq2 <- hilda2$age^2
	levels(hilda2$health) <- c("good", "not good", "not good")

	model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial)
	model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial)
	model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial)

	# not significant
	model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.health, model.new)

	# significant
	model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.sex, model.new)

	# what if we merge the groups to exercise yes/no?
	levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1))
	agg <- get_aggregate(hilda2, hilda2$exercise)
	agg
	plot(agg$variable, agg$props)

	# lets compare again

	# significant (just barely)
	model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.health, model.new)

	# significant
	model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.sex, model.new)

	# what does this tell us? it does seem that exercise is correlated with health
	# which makes sense..

	# we need to relevel exercise so that its yes / no against good / not good health
	cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health))

	# sanity check, this should be much lower...
	cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex))



	# ==============[ Household income ]===============

	load('hilda.Rdata')
	hilda <- setup_data_base(hilda)


	hist(hilda$inc_hh[hilda$inc_hh <= 5000])

	# form the first model by splitting the ages into groups of five
	hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000))
	hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F)

	# first group is extremely small (141)... merge with second lowest
	NROW(hilda$inc_hh[hilda$inc_hh.interval == 0])
	hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000))
	hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F)

	# looks evenly distributed.. lets rename the categories

	# rename the factors..
	hilda$inc_hh.interval <- factor(hilda$inc_hh.interval,
	labels=c("1-999", "1000-1499", "1500-1999", "2000+"))

	agg <- get_aggregate(hilda, hilda$inc_hh.interval)
	agg
	plot(agg$variable, agg$props)


	# although this data is continuous, it looks like it makes sense to split it into
	# two halves, one for 1-1499 and one for 1500+, lets test this hypothesis

	model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
	summary(model.income)

	# 1000-1499 is not significantly different from 0-1000..

	# lets compare category 3 and 4
	model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial)
	summary(model.income)
	# as expected, 2000+ is not significantly different from 1500-1999

	# lets merge and make the model (DO NOT RUN THIS TWICE)
	if (NROW(levels(hilda$inc_hh.interval)) > 2){
	levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high")
	}
	model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
	summary(model.income)

	# now lets compare the model against other models such as age, sex, exercise

	# first we merge categories as per usual
	levels(hilda$health) <- c("good", "not good", "not good")

	model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial)
	model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial)
	model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial)

	# significant
	model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.health, model.new)

	# not significant (?)
	model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.sex, model.new)

	# lets compare against a more complex model.

	model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial)
	model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.prev, model.new)