Created September 16, 2018 14:33
-
-
Save acoppock/1e19e947cd114bf9dac847fa4a39513e to your computer and use it in GitHub Desktop.
Gelman and Hill Chapter 3 Using ggplot and estimatr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gelman and Hill 2007: Chapter 3 Regression Examples
# Using ggplot and estimatr

# Uncomment to install
# install.packages("ggplot2")
# install.packages("haven")
# install.packages("estimatr")
library(ggplot2)
library(haven)
library(estimatr)

# Data and example are from Gelman and Hill (2007), a subsample from the
# National Longitudinal Survey of Youth. Downloaded from:
# http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/
kid <- read_dta("http://www.stat.columbia.edu/~gelman/arm/examples/child.iq/kidiq.dta")
# Overriding goal is to describe the variable "kid_score" and how it might
# vary with mom_hs and mom_iq.

# Let's look at the outcome variable itself
ggplot(kid, aes(kid_score)) + geom_histogram(bins = 30)

# What's a good single-number summary?

# MSE demonstration -------------------------------------------------------
# For each candidate summary value, compute the mean squared error against
# the observed scores; the minimizer should be (close to) the sample mean.
proposals <- seq(50, 150, by = 0.1)
mses <- vapply(
  proposals,
  function(p) mean((p - kid$kid_score)^2),
  numeric(1)
)
proposals[which.min(mses)]
# really close to
mean(kid$kid_score)
# which is to say, the mean is a "good" single-number summary because it
# has the lowest mean squared error among these proposals
# Setting up a predictions data.frame -------------------------------------
# Four hypothetical mothers: with/without a HS diploma, at low/high IQ.
# Each model below adds a column of predictions for these same four units.
prediction_df <-
  data.frame(mom_hs = c(0, 0, 1, 1),
             mom_iq = c(71, 140, 71, 140))

# Plot 1: kid_score ~ 1 ---------------------------------------------------
g_1 <- ggplot(kid, aes(y = kid_score, x = 1)) + geom_point()
g_1

# the regression
fit_1 <- lm_robust(kid_score ~ 1, data = kid)
fit_1
# comment: the intercept-only fit is the same as the mean
prediction_df$pred_1 <- predict(fit_1, newdata = prediction_df)

# add the summary on top
g_1 + geom_point(data = prediction_df, aes(y = pred_1), size = 6, color = "red")
# Plot 2: kid_score ~ mom_hs ----------------------------------------------
g_2 <- ggplot(kid, aes(y = kid_score, x = mom_hs)) + geom_point()
g_2

# the regression
fit_2 <- lm_robust(kid_score ~ mom_hs, data = kid)
fit_2
# comment: this is the MSE-minimizing two-number summary, using mom_hs!
prediction_df$pred_2 <- predict(fit_2, newdata = prediction_df)

# add the summary on top
g_2 + geom_line(data = prediction_df, aes(y = pred_2), color = "red", size = 6)

# could also do this (estimatr fits are usable as a smoother method):
g_2 + stat_smooth(method = "lm_robust")
# Plot 3: kid_score ~ mom_iq ----------------------------------------------
g_3 <- ggplot(kid, aes(y = kid_score, x = mom_iq)) + geom_point()
g_3

# the regression
fit_3 <- lm_robust(kid_score ~ mom_iq, data = kid)
fit_3
# comment: the MSE-minimizing 2-number summary (intercept + slope), using mom_iq
prediction_df$pred_3 <- predict(fit_3, newdata = prediction_df)

# add the summary on top
g_3 + geom_line(data = prediction_df, aes(y = pred_3), color = "red", size = 2)

# could also do
g_3 + stat_smooth(method = "lm_robust")
# Plot 4: kid_score ~ mom_hs + mom_iq -------------------------------------
# Color/group by mom_hs so each education group gets its own (parallel) line.
g_4 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_4

# the regression (additive model: same slope on mom_iq in both groups)
fit_4 <- lm_robust(kid_score ~ mom_hs + mom_iq, data = kid)
fit_4
prediction_df$pred_4 <- predict(fit_4, newdata = prediction_df)
g_4 + geom_line(data = prediction_df, aes(y = pred_4), size = 2)
# hard to use stat_smooth for this one: stat_smooth fits per group, which
# would estimate separate slopes rather than this additive model
# Plot 5: kid_score ~ mom_hs * mom_iq (main effects + interaction) ---------
g_5 <- ggplot(kid, aes(y = kid_score, x = mom_iq, color = mom_hs, group = mom_hs)) + geom_point()
g_5

# the regression
# Note: `mom_hs * mom_iq` expands to mom_hs + mom_iq + mom_hs:mom_iq, so the
# formula is equivalent to writing the main effects explicitly.
fit_5 <- lm_robust(kid_score ~ mom_hs * mom_iq, data = kid)
fit_5
prediction_df$pred_5 <- predict(fit_5, newdata = prediction_df)
g_5 + geom_line(data = prediction_df, aes(y = pred_5), size = 2)

# can do stat_smooth here though, since per-group fits match the
# interaction model...
g_5 + stat_smooth(method = "lm_robust", fullrange = TRUE)
# Summary ------------------------------------------------------------------
# We ran five models. Each makes *different* predictions for these four units:
prediction_df
# each of the predictions corresponds to a different descriptive summary
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment