Created
May 12, 2017 13:06
-
-
Save sebnyberg/9977c2366aed841f725b26301f49c0ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
source('helpers.R') | |
# ====================================== | |
# ==============[ Q 3.4 ]=============== | |
# ====================================== | |
# prepare the model by defining factors | |
load('hilda.Rdata') | |
hilda <- setup_data_base(hilda) | |
hilda <- setup_data(hilda) | |
model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial) | |
model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb + | |
hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown + | |
hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh + | |
hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial) | |
step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1) | |
# exercise seems like a good category to introduce, but many did not answer | |
# or were not asked.. we need to inspect a subset of the entire dataset | |
model.new <- update(model.current, . ~ . + hilda$exercise) | |
compare_models(model.current, model.new) | |
NROW(model.new$fitted.values) | |
# about half the subjects were not asked about whether or not they exercise. | |
# we can however use those who did answer to check if there is any interesting | |
# relationship between exercise and hospital days | |
# ==============[ EXERCISE ]=============== | |
load('hilda.Rdata') | |
hilda2 <- setup_data_base(hilda) | |
# exercise habits (0 = not asked, 8 = dont know, 9 = no answer) | |
# not asked has 4 entries, dont know has 0, no answer has 26 | |
hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA | |
hilda2$exercise <- factor(hilda2$exercise, | |
labels = c("practically none", "sometimes" ,"once a week", | |
"twice a week", "at least twice"), | |
levels = c(1,2,3,4,5)) | |
hilda2 <- na.omit(hilda2) | |
# plot histogram, looks alright | |
hist(as.numeric(hilda$exercise)) | |
# lets look at the probabilities for the categories.. | |
agg <- get_aggregate(hilda2, hilda2$exercise) | |
agg | |
plot(agg$variable, agg$props) | |
levels(hilda2$exercise) | |
hilda2$exercise | |
model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial) | |
summary(model.exercise) | |
age.sq2 <- hilda2$age^2 | |
levels(hilda2$health) <- c("good", "not good", "not good") | |
model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial) | |
model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial) | |
model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial) | |
# not significant | |
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.health, model.new) | |
# significant | |
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.age, model.new) | |
# significant | |
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.sex, model.new) | |
# what if we merge the groups to exercise yes/no? | |
levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1)) | |
agg <- get_aggregate(hilda2, hilda2$exercise) | |
agg | |
plot(agg$variable, agg$props) | |
# lets compare again | |
# significant (just barely) | |
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.health, model.new) | |
# significant | |
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.age, model.new) | |
# significant | |
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial) | |
compare_models(model.sex, model.new) | |
# what does this tell us? it does seem that exercise is correlated with health | |
# which makes sense.. | |
# we need to relevel exercise so that its yes / no against good / not good health | |
cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health)) | |
# sanity check, this should be much lower... | |
cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex)) | |
# ==============[ Household income ]=============== | |
load('hilda.Rdata') | |
hilda <- setup_data_base(hilda) | |
hist(hilda$inc_hh[hilda$inc_hh <= 5000]) | |
# form the first model by splitting the ages into groups of five | |
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000)) | |
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F) | |
# first group is extremely small (141)... merge with second lowest | |
NROW(hilda$inc_hh[hilda$inc_hh.interval == 0]) | |
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000)) | |
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F) | |
# looks evenly distributed.. lets rename the categories | |
# rename the factors.. | |
hilda$inc_hh.interval <- factor(hilda$inc_hh.interval, | |
labels=c("1-999", "1000-1499", "1500-1999", "2000+")) | |
agg <- get_aggregate(hilda, hilda$inc_hh.interval) | |
agg | |
plot(agg$variable, agg$props) | |
# although this data is continuous, it looks like it makes sense to split it into | |
# two halves, one for 1-1499 and one for 1500+, lets test this hypothesis | |
model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial) | |
summary(model.income) | |
# 1000-1499 is not significantly different from 0-1000.. | |
# lets compare category 3 and 4 | |
model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial) | |
summary(model.income) | |
# as expected, 2000+ is not significantly different from 1500-1999 | |
# lets merge and make the model (DO NOT RUN THIS TWICE) | |
if (NROW(levels(hilda$inc_hh.interval)) > 2){ | |
levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high") | |
} | |
model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial) | |
summary(model.income) | |
# now lets compare the model against other models such as age, sex, exercise | |
# first we merge categories as per usual | |
levels(hilda$health) <- c("good", "not good", "not good") | |
model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial) | |
model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial) | |
model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial) | |
# significant | |
model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial) | |
compare_models(model.health, model.new) | |
# not significant (?) | |
model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial) | |
compare_models(model.age, model.new) | |
# significant | |
model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial) | |
compare_models(model.sex, model.new) | |
# lets compare against a more complex model. | |
model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial) | |
model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial) | |
compare_models(model.prev, model.new) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment