jknowles · June 17, 2019 15:50
diff --git a/sdp_reg_exercise_1_example_2019.R b/sdp_reg_exercise_1_example_2019.R
 ###################################################################################################
 ## Title: Regression Module Assignment 1
 ## Exploring Student Composition Effects on Test Score Growth
 ## Author: Jared E. Knowles, Civilytics Consulting
 ## Date: 6/12/2019
 ## Last Updated: 6/17/2019
 ###################################################################################################

 # ----------------------------------------------------------------------------
 # Load the data
 # You will need to change the path to point to your own version of the data file:
 load("../guided tutorial/regression_1_r/data/full_sch_data.rda")


 # ----------------------------------------------------------------------------
 # Load any packages we will need to complete this project
 # ----------------------------------------------------------------------------
 library(ggplot2) # for plotting
 library(dplyr) # for data manipulation
 library(broom) # for working with statistical models in a tidy way
 library(coefplot) # for making quick plots of regression coefficients 
 # ----------------------------------------------------------------------------
 # Recreate original model
 m1 <- lm(ss2 ~ ss1, data = sch_score)

 coefplot(m1, intercept = FALSE)
 # ----------------------------------------------------------------------------
 # Select two additional variables
 # I want look at role student with disabilities and free and reduced lunch 
 # populations play in explaining student test scores

 m2 <- lm(ss2 ~ ss1 + swd_per + frl_per, data = sch_score)
 coefplot(m2, intercept = FALSE)
 # ----------------------------------------------------------------------------
 # Note that including more variables reduces the number of available observations 
 # because of missingness. 

 nrow(m1$model) - nrow(m2$model)

 # 193 school-grade-subject observations are lost

 # Compare model fits
 glance(m1)
 glance(m2)

 # Compare coefficients and estimates
 tidy(m1)
 tidy(m2)

 # ----------------------------------------------------------------------------
 # Diagnostic 1
 # Residual versus fitted

 # Let's compare residuals vs. fitted for m1 and m2 side by side

 # Because some observations have missing values, to add them to the 
 # original data we need to use the "predict()" function

 sch_score$m1_fitted <- predict(m1, newdata = sch_score)
 sch_score$m2_fitted <- predict(m2, newdata = sch_score)
 sch_score$m1_resid <- sch_score$ss2 - sch_score$m1_fitted
 sch_score$m2_resid <- sch_score$ss2 - sch_score$m2_fitted

 ggplot(sch_score) + aes(x = m1_fitted, y = m1_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Original Model")


 ggplot(sch_score) + aes(x = m2_fitted, y = m2_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Proposed Model")

 # Plot residuals and their variation by class size
 # Residuals by class size

 ggplot(sch_score, aes(x = n1, y = m1_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Model Residuals Plotted Against Class Size")

 ggplot(sch_score, aes(x = n1, y = m2_resid)) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Model Residuals Plotted Against Class Size")

 table("overperform" = sch_score$std_resid >= 2, "under 100" = sch_score$n1 < 100)

 sch_score$std_resid <- rstandard(m1)
 ggplot(sch_score) + aes(x = yhat, y = std_resid) +
  geom_point(alpha = I(0.1)) +
  geom_smooth() + geom_smooth(method = "lm") + theme_bw() +
  labs(title = "Residual Plot for Proposed Model")



 # ----------------------------------------------------------------------------
 # Let's look for constant or non-constant variance in residuals by decile of fitted value

 sch_score$bin <- dplyr::ntile(sch_score$m2_fitted, 10)
 sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m2_resid),
            error_sd = sd(m2_resid))

 sch_score$bin <- dplyr::ntile(sch_score$m1_fitted, 10)
 sch_score %>% group_by(bin) %>%
  summarize(error_var = var(m1_resid),
            error_sd = sd(m1_resid))


 # ----------------------------------------------------------------------------
 # Standard residuals
 # Let's evaluate which schools would stand out as exceptional high or low 
 # performers using standardized residuals

 sch_score$m1_std_resid <- rstandard(m1)
 # Manually compute the standardized residual because of missing values
 sch_score$m2_std_resid <- sch_score$m2_resid / sd(sch_score$m2_resid, na.rm = TRUE)

 # Compare original model residuals with new model standardized residuals
 # Look at which schools are standouts
 table(sch_score$m1_std_resid >= 2, sch_score$test_year)
 table(sch_score$m2_std_resid >= 2, sch_score$test_year)

 table(sch_score$m1_std_resid >= 2, sch_score$grade)
 table(sch_score$m2_std_resid >= 2, sch_score$grade)

 table(sch_score$m1_std_resid >= 2, sch_score$subject)
 table(sch_score$m2_std_resid >= 2, sch_score$subject)

 # We still are more likely to detect changes in lower grade, more likely to detect changes in 
 # math and more likely to detect schools in 2007 and 2009


 #-----------------------------------------------------------------------------
 # outlier values
 # Let's look at where outliers are on our plot

 sch_score$flag_x <- ifelse(abs(as.numeric(scale(sch_score$ss1))) >= 3, "Y", "N")
 sch_score$flag_y <- ifelse(abs(as.numeric(scale(sch_score$ss2))) >= 3, "Y", "N")
 table(x_outlier = sch_score$flag_x, y_outlier = sch_score$flag_y)

 sch_score$outlier_flag <- ifelse(sch_score$flag_x == "Y" | 
                                   sch_score$flag_y == "Y", "Yes", "No")

 ggplot(sch_score, aes(x = ss1, y = ss2, color = outlier_flag)) +
  geom_point(alpha = I(0.4)) +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(group = 1), method = "lm", linetype = 2, se=FALSE) +
  theme_bw()


 # ----------------------------------------------------------------------------
 # Look at largest district compared to other districts
 # m_bos - without largest district

 # Fit a model only to districts that are not the largest district
 m2_bos <- lm(formula(m2), # we can pass sample restriction to the data argument
             data = sch_score[sch_score$distid != 174, ])

 # m_174 - only largest district
 m2_174 <- lm(formula(m2), 
             data = sch_score[sch_score$distid == 174, ])

 # Review coefficients
 coef(m2_bos)
 coef(m2_174)

 # Create a variable with predictions from the largest district using the 
 # model fit to that subsample.
 # For districts that are not the largest district, use the predictions from the 
 # model fit to that sample instead. 
 sch_score$m2_bos_dist[sch_score$distid == 174] <- 
  predict(m2_174, newdata = sch_score[sch_score$distid == 174,])
 sch_score$m2_bos_dist[sch_score$distid != 174] <- 
  predict(m2_bos, newdata = sch_score[sch_score$distid != 174,])

 # For plotting, create a flag for each observation indicating whether it is 
 # the biggest district or not. 
 sch_score$bos_flag <- ifelse(sch_score$distid == 174, "No", "Yes")

 # Plot
 ggplot(sch_score, aes(x = ss1, y = ss2)) +
  geom_point(alpha = I(0.4), color = "cadetblue3") +
  scale_color_brewer(type = "qual", palette = 2) +
  geom_smooth(aes(y = m2_bos_dist, color = bos_flag, group = bos_flag), 
              linetype = 2, method = "lm") + 
  labs(title = "Comparing District 174 Model to Balance of State", 
       subtitle = "Analyzing Fitted values from M2 on Subsamples", 
       caption = "Model represented by linear average of predictions.", 
       x = "Pre-test", y = "Post-test", color = "Balance of State") +
  theme_bw()
	###################################################################################################
	## Title: Regression Module Assignment 1
	## Exploring Student Composition Effects on Test Score Growth
	## Author: Jared E. Knowles, Civilytics Consulting
	## Date: 6/12/2019
	## Last Updated: 6/17/2019
	###################################################################################################

	# ----------------------------------------------------------------------------
	# Load the data
	# You will need to change the path to point to your own version of the data file:
	load("../guided tutorial/regression_1_r/data/full_sch_data.rda")


	# ----------------------------------------------------------------------------
	# Load any packages we will need to complete this project
	# ----------------------------------------------------------------------------
	library(ggplot2) # for plotting
	library(dplyr) # for data manipulation
	library(broom) # for working with statistical models in a tidy way
	library(coefplot) # for making quick plots of regression coefficients
	# ----------------------------------------------------------------------------
	# Recreate original model
	m1 <- lm(ss2 ~ ss1, data = sch_score)

	coefplot(m1, intercept = FALSE)
	# ----------------------------------------------------------------------------
	# Select two additional variables
	# I want look at role student with disabilities and free and reduced lunch
	# populations play in explaining student test scores

	m2 <- lm(ss2 ~ ss1 + swd_per + frl_per, data = sch_score)
	coefplot(m2, intercept = FALSE)
	# ----------------------------------------------------------------------------
	# Note that including more variables reduces the number of available observations
	# because of missingness.

	nrow(m1$model) - nrow(m2$model)

	# 193 school-grade-subject observations are lost

	# Compare model fits
	glance(m1)
	glance(m2)

	# Compare coefficients and estimates
	tidy(m1)
	tidy(m2)

	# ----------------------------------------------------------------------------
	# Diagnostic 1
	# Residual versus fitted

	# Let's compare residuals vs. fitted for m1 and m2 side by side

	# Because some observations have missing values, to add them to the
	# original data we need to use the "predict()" function

	sch_score$m1_fitted <- predict(m1, newdata = sch_score)
	sch_score$m2_fitted <- predict(m2, newdata = sch_score)
	sch_score$m1_resid <- sch_score$ss2 - sch_score$m1_fitted
	sch_score$m2_resid <- sch_score$ss2 - sch_score$m2_fitted

	ggplot(sch_score) + aes(x = m1_fitted, y = m1_resid) +
	geom_point(alpha = I(0.1)) +
	geom_smooth(method = "lm") + theme_bw() +
	labs(title = "Residual Plot for Original Model")


	ggplot(sch_score) + aes(x = m2_fitted, y = m2_resid) +
	geom_point(alpha = I(0.1)) +
	geom_smooth(method = "lm") + theme_bw() +
	labs(title = "Residual Plot for Proposed Model")

	# Plot residuals and their variation by class size
	# Residuals by class size

	ggplot(sch_score, aes(x = n1, y = m1_resid)) +
	geom_point(alpha = I(0.1)) +
	geom_smooth() +
	theme_bw() +
	labs(title = "Model Residuals Plotted Against Class Size")

	ggplot(sch_score, aes(x = n1, y = m2_resid)) +
	geom_point(alpha = I(0.1)) +
	geom_smooth() +
	theme_bw() +
	labs(title = "Model Residuals Plotted Against Class Size")

	table("overperform" = sch_score$std_resid >= 2, "under 100" = sch_score$n1 < 100)

	sch_score$std_resid <- rstandard(m1)
	ggplot(sch_score) + aes(x = yhat, y = std_resid) +
	geom_point(alpha = I(0.1)) +
	geom_smooth() + geom_smooth(method = "lm") + theme_bw() +
	labs(title = "Residual Plot for Proposed Model")



	# ----------------------------------------------------------------------------
	# Let's look for constant or non-constant variance in residuals by decile of fitted value

	sch_score$bin <- dplyr::ntile(sch_score$m2_fitted, 10)
	sch_score %>% group_by(bin) %>%
	summarize(error_var = var(m2_resid),
	error_sd = sd(m2_resid))

	sch_score$bin <- dplyr::ntile(sch_score$m1_fitted, 10)
	sch_score %>% group_by(bin) %>%
	summarize(error_var = var(m1_resid),
	error_sd = sd(m1_resid))


	# ----------------------------------------------------------------------------
	# Standard residuals
	# Let's evaluate which schools would stand out as exceptional high or low
	# performers using standardized residuals

	sch_score$m1_std_resid <- rstandard(m1)
	# Manually compute the standardized residual because of missing values
	sch_score$m2_std_resid <- sch_score$m2_resid / sd(sch_score$m2_resid, na.rm = TRUE)

	# Compare original model residuals with new model standardized residuals
	# Look at which schools are standouts
	table(sch_score$m1_std_resid >= 2, sch_score$test_year)
	table(sch_score$m2_std_resid >= 2, sch_score$test_year)

	table(sch_score$m1_std_resid >= 2, sch_score$grade)
	table(sch_score$m2_std_resid >= 2, sch_score$grade)

	table(sch_score$m1_std_resid >= 2, sch_score$subject)
	table(sch_score$m2_std_resid >= 2, sch_score$subject)

	# We still are more likely to detect changes in lower grade, more likely to detect changes in
	# math and more likely to detect schools in 2007 and 2009


	#-----------------------------------------------------------------------------
	# outlier values
	# Let's look at where outliers are on our plot

	sch_score$flag_x <- ifelse(abs(as.numeric(scale(sch_score$ss1))) >= 3, "Y", "N")
	sch_score$flag_y <- ifelse(abs(as.numeric(scale(sch_score$ss2))) >= 3, "Y", "N")
	table(x_outlier = sch_score$flag_x, y_outlier = sch_score$flag_y)

	sch_score$outlier_flag <- ifelse(sch_score$flag_x == "Y" \|
	sch_score$flag_y == "Y", "Yes", "No")

	ggplot(sch_score, aes(x = ss1, y = ss2, color = outlier_flag)) +
	geom_point(alpha = I(0.4)) +
	scale_color_brewer(type = "qual", palette = 2) +
	geom_smooth(aes(group = 1), method = "lm", linetype = 2, se=FALSE) +
	theme_bw()


	# ----------------------------------------------------------------------------
	# Look at largest district compared to other districts
	# m_bos - without largest district

	# Fit a model only to districts that are not the largest district
	m2_bos <- lm(formula(m2), # we can pass sample restriction to the data argument
	data = sch_score[sch_score$distid != 174, ])

	# m_174 - only largest district
	m2_174 <- lm(formula(m2),
	data = sch_score[sch_score$distid == 174, ])

	# Review coefficients
	coef(m2_bos)
	coef(m2_174)

	# Create a variable with predictions from the largest district using the
	# model fit to that subsample.
	# For districts that are not the largest district, use the predictions from the
	# model fit to that sample instead.
	sch_score$m2_bos_dist[sch_score$distid == 174] <-
	predict(m2_174, newdata = sch_score[sch_score$distid == 174,])
	sch_score$m2_bos_dist[sch_score$distid != 174] <-
	predict(m2_bos, newdata = sch_score[sch_score$distid != 174,])

	# For plotting, create a flag for each observation indicating whether it is
	# the biggest district or not.
	sch_score$bos_flag <- ifelse(sch_score$distid == 174, "No", "Yes")

	# Plot
	ggplot(sch_score, aes(x = ss1, y = ss2)) +
	geom_point(alpha = I(0.4), color = "cadetblue3") +
	scale_color_brewer(type = "qual", palette = 2) +
	geom_smooth(aes(y = m2_bos_dist, color = bos_flag, group = bos_flag),
	linetype = 2, method = "lm") +
	labs(title = "Comparing District 174 Model to Balance of State",
	subtitle = "Analyzing Fitted values from M2 on Subsamples",
	caption = "Model represented by linear average of predictions.",
	x = "Pre-test", y = "Post-test", color = "Balance of State") +
	theme_bw()