Created
May 11, 2017 15:00
-
-
Save sebnyberg/1b6588d9730f5380cdd26e6b7b403a7a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
# returns the odds of a model given an input vector x | |
get_log_odds <- function(model,x) { | |
n_coeff <- length(model$coefficients) | |
if (n_coeff - length(x) == 1) { | |
# user forgot the first 1 in [1, x1,...] | |
x <- c(1,x) | |
} else if (length(x) > n_coeff + 1 | |
&& n_coeff - NCOL(x) == 1) { | |
x <- cbind(1, x) | |
} | |
if (n_coeff != NCOL(x) && | |
n_coeff != length(x)) { | |
expected_n <- n_coeff - 1 | |
print(noquote(c("X is not of correct length, each row should be of size", expected_n))) | |
return(NA) | |
} | |
# calculate the odds | |
log_odds = x %*% model$coefficients | |
return(log_odds) | |
} | |
# creates an aggregate table of proportions for some variable | |
get_aggregate <- function(dataset, list) { | |
if (max(dataset$hospdays > 1)) { | |
dataset$hospdays[dataset$hospdays >= 1] <- 1 | |
} | |
if (!(c("no.hospdays") %in% colnames(dataset))) { | |
dataset$no.hospdays <- 1 - dataset$hospdays | |
} | |
res <- aggregate(dataset[,c("hospdays","no.hospdays")],by=list(variable=list),FUN=sum) | |
res$total <- res$hospdays + res$no.hospdays | |
res$props <- res$hospdays / res$total | |
return(res) | |
} | |
# compares two logistic models by calculating the log likelihood | |
# and testing against the chi squared distribution at 95% | |
compare_models <- function(current, new) { | |
d.new <- -2*logLik(new) | |
d.current <- -2*logLik(current) | |
d.diff <- d.current - d.new | |
result <- NA | |
retval <- NA | |
qquant <- NA | |
df = NROW(new$coefficients) - NROW(current$coefficients) | |
if (df < 1) { | |
print(noquote("cannot use chisq since df is the same or lower")) | |
} | |
else { | |
qquant <- qchisq(1-0.05,df) | |
if (d.diff > qquant) { | |
result <- "Yes" | |
retval <- TRUE | |
} else { | |
result <- "No" | |
retval <- FALSE | |
} | |
} | |
result.frame = data.frame(d.new = c(d.new), | |
d.current = c(d.current), | |
d.diff = c(d.diff), | |
df = c(df), | |
qquant = c(qquant), | |
reject.h0 = c(result)) | |
print(result.frame) | |
return(retval) | |
} | |
setup_data_base <- function(data) { | |
# redefine the hospitaldays to be 1 for anything above 1 | |
data$hospdays[(data$hospdays >= 1)] = 1 | |
# count no.hospdays to be the complement of hospdays | |
data$no.hospdays = 1 - data$hospdays | |
# define sex as a factor | |
data$sex <- factor(data$sex, labels=c("male", "female"), levels=c(1,2)) | |
# redefine all answers which were "other" or "not answered" to be NA | |
data$health[(data$health >= 4)] = NA | |
data$health <- factor(data$health, labels=c("good", "bad", "inbetween")) | |
return(data) | |
} | |
setup_data <- function(data) { | |
#load('hilda.Rdata', envir=globalenv()) | |
#hildacopy <- hilda | |
data <- setup_data_base(data) | |
# civilst from registry | |
data$civilst_rtb <- factor(data$civilst_rtb, | |
labels = c("unmarried", "married man", | |
"married woman living apart", | |
"divorced", "widow/widower", "dead", | |
"married womand with husband", "child<18", | |
"foster child<18"), | |
levels = c(1,2,3,4,5,6,7,8,9)) | |
# civilst in interview (9 = no answer) | |
data$civilst[data$civilst == 9] <- NA | |
data$civilst <- factor(data$civilst, | |
labels = c("unmarried", "married", | |
"divorced/separated", "widow/widower"), | |
levels = c(1,2,3,4)) | |
# education (0 = not asked) | |
data$educ[data$educ == 0] <- NA | |
data$educ <- factor(data$educ, | |
labels = c("pregymnasial<9", "pregymnasial9-10", | |
"gymnasial<2", "gymnasial2-3", "post-gymnasial<3", | |
"post-gymnasial>3", "post-graduate"), | |
levels = c(1,2,3,4,5,6,7)) | |
# exercise habits (0 = not asked, 8 = dont know, 9 = no answer) | |
# not asked has 4 entries, dont know has 0, no answer has 26 | |
data$exercise[data$exercise %in% c(0,8,9)] <- NA | |
data$exercise <- factor(data$exercise, | |
labels = c("practically none", "sometimes" ,"once a week", | |
"twice a week", "at least twice"), | |
levels = c(1,2,3,4,5)) | |
# normal hours of work per week | |
data$work_norm <- factor(data$work_norm, | |
labels = c("1-19hr", "20-34hr", "35-97hr", | |
"farmer or self-employed", "other/doesntwork/dontknow"), | |
levels = c(1,2,3,4,5)) | |
# exercise habits (98 = dont know, 99 = no answer) | |
data$work_week[data$work_week %in% c(98,99)] <- NA | |
# income (999999 = missing) | |
data$inc_hh[data$inc_hh == 999999] <- NA | |
data$inc_salary[data$inc_salary == 999999] <- NA | |
data$inc_cap[data$inc_cap == 999999] <- NA | |
data$inc_work[data$inc_work == 999999] <- NA | |
data$inc_pens[data$inc_pens == 999999] <- NA | |
return(data) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment