Created September 16, 2018 14:33
-
-
Save acoppock/1e19e947cd114bf9dac847fa4a39513e to your computer and use it in GitHub Desktop.
Gelman and Hill Chapter 3 Using ggplot and estimatr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gelman and Hill 2007: Chapter 3 Regression Examples
# Using ggplot and estimatr

# Uncomment to install
# install.packages("ggplot2")
# install.packages("haven")
# install.packages("estimatr")
library(ggplot2)
library(haven)
library(estimatr)

# Data and example are from Gelman and Hill (2007), a subsample from the
# National Longitudinal Survey of Youth. Downloaded from:
# http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/
kid <- read_dta("http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta")
# Overriding goal is to describe the variable "kid_score" and how it might
# vary with mom_hs and mom_iq.

# Let's look at the outcome variable itself
ggplot(kid, aes(kid_score)) + geom_histogram(bins = 30)

# What's a good single-number summary?

# MSE demonstration -------------------------------------------------------
# For each candidate summary value, compute the mean squared error against
# the observed scores; the minimizer should be (close to) the sample mean.
proposals <- seq(50, 150, by = 0.1)
mses <- vapply(
  proposals,
  function(p) mean((p - kid$kid_score)^2),
  numeric(1)
)
proposals[which.min(mses)]
# really close to
mean(kid$kid_score)
# which is to say, the mean is a "good" single-number summary because it
# has the lowest mean squared error among these proposals
# Setting up a predictions data.frame -------------------------------------
# Four hypothetical mothers: with/without a HS diploma, at low/high IQ.
# Each model below adds a column of predictions for these same four units.
prediction_df <-
  data.frame(mom_hs = c(0, 0, 1, 1),
             mom_iq = c(71, 140, 71, 140))

# Plot 1: kid_score ~ 1 ---------------------------------------------------
g_1 <- ggplot(kid, aes(y = kid_score, x = 1)) + geom_point()
g_1

# the regression
fit_1 <- lm_robust(kid_score ~ 1, data = kid)
fit_1
# comment: the intercept-only fit is the same as the mean
prediction_df$pred_1 <- predict(fit_1, newdata = prediction_df)

# add the summary on top
g_1 + geom_point(data = prediction_df, aes(y = pred_1), size = 6, color = "red")
# Plot 2: kid_score ~ mom_hs ----------------------------------------------
g_2 <- ggplot(kid, aes(y = kid_score, x = mom_hs)) + geom_point()
g_2

# the regression
fit_2 <- lm_robust(kid_score ~ mom_hs, data = kid)
fit_2
# comment: this is the MSE-minimizing two-number summary, using mom_hs!
prediction_df$pred_2 <- predict(fit_2, newdata = prediction_df)

# add the summary on top
g_2 + geom_line(data = prediction_df, aes(y = pred_2), color = "red", size = 6)

# could also do this (estimatr fits are usable as a smoother method):
g_2 + stat_smooth(method = "lm_robust")
# Plot 3: kid_score ~ mom_iq ----------------------------------------------
g_3 <- ggplot(kid, aes(y = kid_score, x = mom_iq)) + geom_point()
g_3

# the regression
fit_3 <- lm_robust(kid_score ~ mom_iq, data = kid)
fit_3
# comment: the MSE-minimizing 2-number summary (intercept + slope), using mom_iq
prediction_df$pred_3 <- predict(fit_3, newdata = prediction_df)

# add the summary on top
g_3 + geom_line(data = prediction_df, aes(y = pred_3), color = "red", size = 2)

# could also do
g_3 + stat_smooth(method = "lm_robust")
# Plot 4: kid_score ~ mom_hs + mom_iq -------------------------------------
# Color/group by mom_hs so each education group gets its own (parallel) line.
g_4 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_4

# the regression (additive model: same slope on mom_iq in both groups)
fit_4 <- lm_robust(kid_score ~ mom_hs + mom_iq, data = kid)
fit_4
prediction_df$pred_4 <- predict(fit_4, newdata = prediction_df)
g_4 + geom_line(data = prediction_df, aes(y = pred_4), size = 2)
# hard to use stat_smooth for this one: stat_smooth fits per group, which
# would estimate separate slopes rather than this additive model
# Plot 5: kid_score ~ mom_hs * mom_iq (main effects + interaction) ---------
g_5 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_5

# the regression
# Note: `mom_hs * mom_iq` expands to mom_hs + mom_iq + mom_hs:mom_iq, so the
# formula is equivalent to writing the main effects explicitly.
fit_5 <- lm_robust(kid_score ~ mom_hs * mom_iq, data = kid)
fit_5
prediction_df$pred_5 <- predict(fit_5, newdata = prediction_df)
g_5 + geom_line(data = prediction_df, aes(y = pred_5), size = 2)

# can do stat_smooth here though, since per-group fits match the
# interaction model...
g_5 + stat_smooth(method = "lm_robust", fullrange = TRUE)
# Summary ------------------------------------------------------------------
# We ran five models. Each makes *different* predictions for these four units:
prediction_df
# each of the predictions corresponds to a different descriptive summary
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment