Created
October 15, 2014 18:54
-
-
Save dggoldst/a0dba9dcaec070649ffa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(dplyr) | |
AGEMIN=30 | |
AGEMAX=110 | |
NR=AGEMAX-AGEMIN+1 | |
best_ABSDEV=1e6 | |
#If you want the original data, you can get it from | |
#http://www.ssa.gov/OACT/STATS/table4c6.html | |
#And prepare it as follows. | |
#df=read.delim("lifex_2010.txt") | |
#df = df %>% | |
# filter(age>=AGEMIN & age <=AGEMAX) %>% | |
# mutate(lifex=(m_lifex+f_lifex)/2) %>% #Make a unisex life expectancy | |
# select(age,lifex) | |
#Or you can just work with the cleaned data, here: | |
df=structure(list(age = 30:110, lifex = c(49.8, 48.85, 47.9, 46.95, | |
46.005, 45.06, 44.11, 43.165, 42.225, 41.285, 40.345, 39.415, | |
38.48, 37.56, 36.64, 35.725, 34.82, 33.92, 33.025, 32.14, 31.26, | |
30.39, 29.525, 28.665, 27.81, 26.97, 26.125, 25.3, 24.47, 23.655, | |
22.84, 22.03, 21.23, 20.44, 19.655, 18.885, 18.12, 17.375, 16.635, | |
15.915, 15.2, 14.495, 13.81, 13.14, 12.475, 11.83, 11.205, 10.59, | |
10, 9.42, 8.855, 8.315, 7.79, 7.29, 6.81, 6.345, 5.91, 5.495, | |
5.1, 4.735, 4.395, 4.075, 3.785, 3.52, 3.28, 3.065, 2.875, 2.705, | |
2.55, 2.41, 2.275, 2.15, 2.025, 1.905, 1.795, 1.69, 1.585, 1.485, | |
1.385, 1.295, 1.215)), class = "data.frame", row.names = c(NA, | |
-81L), .Names = c("age", "lifex")) | |
### The best simple linear model | |
#For ages 30-110, the best fitting simple linear model is | |
#int=63.3, age=-.64 | |
bestlinear_model=lm(data=df,lifex~age) | |
### The best second-order polynomial model | |
#For ages 30-110, the best fitting simple linear model is | |
#int=92.1, | |
#age=-1.56 | |
#age^2=.007 | |
bestpoly_model=lm(data=df,lifex~age+I(age^2)) | |
### The best bi-linear model | |
#For ages 30-110, the best fitting bi-linear model has | |
#int1=73.84 slope1= -0.84, int2= 27.63, slope2=-0.25 | |
#with a new line starting at age 80 | |
errorlist=vector('list',nrow(df)) | |
for(cut in (5):(nrow(df)-5)){ | |
#There are slicker ways to do this, but this is easier to understand I think | |
#Define dummy variables for interecepts 1 and 2 and slopes 1 and 2 | |
df=mutate(df, | |
i1=c(rep(1,cut),rep(0,AGEMAX-cut-AGEMIN+1)), | |
s1=i1*age, | |
i2=c(rep(0,cut),rep(1,AGEMAX-cut-AGEMIN+1)), | |
s2=i2*age | |
) | |
#Fit a candidate model to the dummies | |
candidate_bilinear_model=lm(data=df,lifex~0+i1+s1+i2+s2) | |
#Record the coefficients and predictions for the model for selection later | |
df$bilinear=predict(candidate_bilinear_model) | |
current_ABSDEV=with(df,sum(abs(bilinear-lifex))/nrow(df)) | |
#Save the predictions of the best model | |
if(current_ABSDEV<best_ABSDEV){ | |
best_ABSDEV=current_ABSDEV | |
best_bilinear_preds = df$bilinear | |
best_cut=cut} | |
#Keep a log of the various model coefficients and errors | |
errorlist[[cut]]=data.frame(cut=cut, | |
ABSDEV=current_ABSDEV, | |
coef_i1=round(candidate_bilinear_model$coef[1],2), | |
coef_s1=round(candidate_bilinear_model$coef[2],2), | |
coef_i2=round(candidate_bilinear_model$coef[3],2), | |
coef_s2=round(candidate_bilinear_model$coef[4],2)) | |
} | |
errdat=do.call("rbind",errorlist) | |
#Plot to show we found the model with the lowest ABSDEV (in errdat at best_cut) | |
#Need to add AGEMIN to this to find the year at which to start the second line | |
p=ggplot(errdat,aes(x=cut,y=ABSDEV)) | |
p=p+geom_point() | |
p | |
ggsave(plot=p,file="ABSDEV_minimizing.png",height=4,width=4) | |
### Heuristic bi-linear model | |
#Here's an approximation with rounder / easier to apply numbers | |
#If you're under 85, it's | |
#72 minus 80% of your age | |
#Otherwise it's | |
#22 minus 20% of your age | |
heuristic_bilinear_model=Vectorize(function(age){ | |
if(age<85) {72-.8*age} | |
else {22-.2*age} | |
}) | |
#The 50-15-5 model | |
#For ages 30, 70, 90, 110 | |
#the respective life-expectancies are | |
#50, 15, 5, 0 | |
#Go forth and interpolate | |
model_50_15_5=Vectorize(function(age){ | |
if(age<70) {50-(age-30)*35/40} | |
else if(age<90) {15-(age-70)*10/20} | |
else {5-(age-90)*5/20} | |
}) | |
### Doctor's heuristic | |
#100 minus your age divided by two | |
doctors=function(age) {(100-age)/2} | |
### Make predictions for each model | |
pred_dat = data.frame( | |
Age=df$age, | |
lifex=df$lifex, | |
Model=c(rep("Best Linear",NR), | |
rep("Best Polynomial",NR), | |
rep("Best Bi-linear",NR), | |
rep("Heuristic Bilinear",NR), | |
rep("50 15 5 Heuristic",NR), | |
rep("Doctors",NR) | |
), | |
Predicted_Life_Expectancy=c(predict(bestlinear_model), | |
predict(bestpoly_model), | |
best_bilinear_preds, | |
heuristic_bilinear_model(df$age), | |
model_50_15_5(df$age), | |
doctors(df$age) | |
)) | |
###Error for each model | |
pred_dat = pred_dat %>% | |
mutate(ABSDEV=abs(lifex-Predicted_Life_Expectancy)) | |
p=ggplot(subset(pred_dat,Model %in% c("Doctors","Best Linear")), | |
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model)) | |
p=p+geom_point(color="black",aes(y=lifex)) | |
p=p+geom_line(size=1.25) | |
p=p+theme(legend.position="bottom") | |
p=p+scale_x_continuous(breaks=seq(0,110,10)) | |
p=p+scale_y_continuous(breaks=seq(-10,110,10)) | |
p=p+scale_color_brewer(palette="Set1") | |
p | |
ggsave(plot=p,file="linear.png",height=4,width=6) | |
p=ggplot(subset(pred_dat,Model %in% c("Best Polynomial","Best Bi-linear")), | |
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model)) | |
p=p+geom_point(color="black",aes(y=lifex)) | |
p=p+geom_line(size=1.25) | |
p=p+theme(legend.position="bottom") | |
p=p+scale_x_continuous(breaks=seq(0,110,10)) | |
p=p+scale_y_continuous(breaks=seq(-10,110,10)) | |
p=p+scale_color_brewer(palette="Dark2") | |
p | |
ggsave(plot=p,file="fitted_nonlinear.png",height=4,width=6) | |
p=ggplot(subset(pred_dat,Model %in% c("Heuristic Bilinear","50 15 5 Heuristic")), | |
aes(x=Age,y=Predicted_Life_Expectancy,group=Model,color=Model)) | |
p=p+geom_point(color="black",aes(y=lifex)) | |
p=p+geom_line(size=1.25) | |
p=p+theme(legend.position="bottom") | |
p=p+scale_x_continuous(breaks=seq(0,110,10)) | |
p=p+scale_y_continuous(breaks=seq(0,110,10)) | |
p=p+scale_color_brewer(palette="Accent") | |
p | |
ggsave(plot=p,file="heuristic_nonlinear.png",height=4,width=6) | |
mad_df = pred_dat %>% | |
group_by(Model) %>% | |
summarize(MAD=mean(ABSDEV), | |
seMAD=sqrt(var(ABSDEV)/length(ABSDEV))) %>% | |
arrange(MAD) | |
mad_df = mad_df %>% | |
mutate(Model=factor(Model,levels=mad_df$Model)) | |
p=ggplot(mad_df,aes(x=Model,y=MAD)) | |
p=p+geom_point() | |
p=p+geom_errorbar(width=.2,aes(ymin=MAD-seMAD,ymax=MAD+seMAD)) | |
p=p+theme( axis.text.x = element_text(angle=90, vjust=0.5)) | |
p=p+ggtitle("Mean Absolute Deviation of Predictions") | |
p | |
ggsave(plot=p,file="mad_predictions.png",height=4,width=4) | |
#######Try it w/ death age | |
df = df %>% mutate(deathx=age+lifex) | |
p=ggplot(df,aes(x=age,y=deathx)) | |
p=p+geom_point() | |
p=p+scale_x_continuous(breaks=seq(0,110,10)) | |
p=p+scale_y_continuous(breaks=seq(0,110,10)) | |
p |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment