Created
June 17, 2019 15:50
-
-
Save jknowles/bf0e30fbe46119809b3b7c3bc2979f4f to your computer and use it in GitHub Desktop.
Exploring student composition effects on test score growth of school average scores. Example for SDP 2019 Regression course.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################################### | |
## Title: Regression Module Assignment 1 | |
## Exploring Student Composition Effects on Test Score Growth | |
## Author: Jared E. Knowles, Civilytics Consulting | |
## Date: 6/12/2019 | |
## Last Updated: 6/17/2019 | |
################################################################################################### | |
# ---------------------------------------------------------------------------- | |
# Load the data | |
# You will need to change the path to point to your own version of the data file: | |
load("../guided tutorial/regression_1_r/data/full_sch_data.rda") | |
# ---------------------------------------------------------------------------- | |
# Load any packages we will need to complete this project | |
# ---------------------------------------------------------------------------- | |
library(ggplot2) # for plotting | |
library(dplyr) # for data manipulation | |
library(broom) # for working with statistical models in a tidy way | |
library(coefplot) # for making quick plots of regression coefficients | |
# ---------------------------------------------------------------------------- | |
# Recreate original model | |
m1 <- lm(ss2 ~ ss1, data = sch_score) | |
coefplot(m1, intercept = FALSE) | |
# ---------------------------------------------------------------------------- | |
# Select two additional variables | |
# I want look at role student with disabilities and free and reduced lunch | |
# populations play in explaining student test scores | |
m2 <- lm(ss2 ~ ss1 + swd_per + frl_per, data = sch_score) | |
coefplot(m2, intercept = FALSE) | |
# ---------------------------------------------------------------------------- | |
# Note that including more variables reduces the number of available observations | |
# because of missingness. | |
nrow(m1$model) - nrow(m2$model) | |
# 193 school-grade-subject observations are lost | |
# Compare model fits | |
glance(m1) | |
glance(m2) | |
# Compare coefficients and estimates | |
tidy(m1) | |
tidy(m2) | |
# ---------------------------------------------------------------------------- | |
# Diagnostic 1 | |
# Residual versus fitted | |
# Let's compare residuals vs. fitted for m1 and m2 side by side | |
# Because some observations have missing values, to add them to the | |
# original data we need to use the "predict()" function | |
sch_score$m1_fitted <- predict(m1, newdata = sch_score) | |
sch_score$m2_fitted <- predict(m2, newdata = sch_score) | |
sch_score$m1_resid <- sch_score$ss2 - sch_score$m1_fitted | |
sch_score$m2_resid <- sch_score$ss2 - sch_score$m2_fitted | |
ggplot(sch_score) + aes(x = m1_fitted, y = m1_resid) + | |
geom_point(alpha = I(0.1)) + | |
geom_smooth(method = "lm") + theme_bw() + | |
labs(title = "Residual Plot for Original Model") | |
ggplot(sch_score) + aes(x = m2_fitted, y = m2_resid) + | |
geom_point(alpha = I(0.1)) + | |
geom_smooth(method = "lm") + theme_bw() + | |
labs(title = "Residual Plot for Proposed Model") | |
# Plot residuals and their variation by class size | |
# Residuals by class size | |
ggplot(sch_score, aes(x = n1, y = m1_resid)) + | |
geom_point(alpha = I(0.1)) + | |
geom_smooth() + | |
theme_bw() + | |
labs(title = "Model Residuals Plotted Against Class Size") | |
ggplot(sch_score, aes(x = n1, y = m2_resid)) + | |
geom_point(alpha = I(0.1)) + | |
geom_smooth() + | |
theme_bw() + | |
labs(title = "Model Residuals Plotted Against Class Size") | |
table("overperform" = sch_score$std_resid >= 2, "under 100" = sch_score$n1 < 100) | |
sch_score$std_resid <- rstandard(m1) | |
ggplot(sch_score) + aes(x = yhat, y = std_resid) + | |
geom_point(alpha = I(0.1)) + | |
geom_smooth() + geom_smooth(method = "lm") + theme_bw() + | |
labs(title = "Residual Plot for Proposed Model") | |
# ---------------------------------------------------------------------------- | |
# Let's look for constant or non-constant variance in residuals by decile of fitted value | |
sch_score$bin <- dplyr::ntile(sch_score$m2_fitted, 10) | |
sch_score %>% group_by(bin) %>% | |
summarize(error_var = var(m2_resid), | |
error_sd = sd(m2_resid)) | |
sch_score$bin <- dplyr::ntile(sch_score$m1_fitted, 10) | |
sch_score %>% group_by(bin) %>% | |
summarize(error_var = var(m1_resid), | |
error_sd = sd(m1_resid)) | |
# ---------------------------------------------------------------------------- | |
# Standard residuals | |
# Let's evaluate which schools would stand out as exceptional high or low | |
# performers using standardized residuals | |
sch_score$m1_std_resid <- rstandard(m1) | |
# Manually compute the standardized residual because of missing values | |
sch_score$m2_std_resid <- sch_score$m2_resid / sd(sch_score$m2_resid, na.rm = TRUE) | |
# Compare original model residuals with new model standardized residuals | |
# Look at which schools are standouts | |
table(sch_score$m1_std_resid >= 2, sch_score$test_year) | |
table(sch_score$m2_std_resid >= 2, sch_score$test_year) | |
table(sch_score$m1_std_resid >= 2, sch_score$grade) | |
table(sch_score$m2_std_resid >= 2, sch_score$grade) | |
table(sch_score$m1_std_resid >= 2, sch_score$subject) | |
table(sch_score$m2_std_resid >= 2, sch_score$subject) | |
# We still are more likely to detect changes in lower grade, more likely to detect changes in | |
# math and more likely to detect schools in 2007 and 2009 | |
#----------------------------------------------------------------------------- | |
# outlier values | |
# Let's look at where outliers are on our plot | |
sch_score$flag_x <- ifelse(abs(as.numeric(scale(sch_score$ss1))) >= 3, "Y", "N") | |
sch_score$flag_y <- ifelse(abs(as.numeric(scale(sch_score$ss2))) >= 3, "Y", "N") | |
table(x_outlier = sch_score$flag_x, y_outlier = sch_score$flag_y) | |
sch_score$outlier_flag <- ifelse(sch_score$flag_x == "Y" | | |
sch_score$flag_y == "Y", "Yes", "No") | |
ggplot(sch_score, aes(x = ss1, y = ss2, color = outlier_flag)) + | |
geom_point(alpha = I(0.4)) + | |
scale_color_brewer(type = "qual", palette = 2) + | |
geom_smooth(aes(group = 1), method = "lm", linetype = 2, se=FALSE) + | |
theme_bw() | |
# ---------------------------------------------------------------------------- | |
# Look at largest district compared to other districts | |
# m_bos - without largest district | |
# Fit a model only to districts that are not the largest district | |
m2_bos <- lm(formula(m2), # we can pass sample restriction to the data argument | |
data = sch_score[sch_score$distid != 174, ]) | |
# m_174 - only largest district | |
m2_174 <- lm(formula(m2), | |
data = sch_score[sch_score$distid == 174, ]) | |
# Review coefficients | |
coef(m2_bos) | |
coef(m2_174) | |
# Create a variable with predictions from the largest district using the | |
# model fit to that subsample. | |
# For districts that are not the largest district, use the predictions from the | |
# model fit to that sample instead. | |
sch_score$m2_bos_dist[sch_score$distid == 174] <- | |
predict(m2_174, newdata = sch_score[sch_score$distid == 174,]) | |
sch_score$m2_bos_dist[sch_score$distid != 174] <- | |
predict(m2_bos, newdata = sch_score[sch_score$distid != 174,]) | |
# For plotting, create a flag for each observation indicating whether it is | |
# the biggest district or not. | |
sch_score$bos_flag <- ifelse(sch_score$distid == 174, "No", "Yes") | |
# Plot | |
ggplot(sch_score, aes(x = ss1, y = ss2)) + | |
geom_point(alpha = I(0.4), color = "cadetblue3") + | |
scale_color_brewer(type = "qual", palette = 2) + | |
geom_smooth(aes(y = m2_bos_dist, color = bos_flag, group = bos_flag), | |
linetype = 2, method = "lm") + | |
labs(title = "Comparing District 174 Model to Balance of State", | |
subtitle = "Analyzing Fitted values from M2 on Subsamples", | |
caption = "Model represented by linear average of predictions.", | |
x = "Pre-test", y = "Post-test", color = "Balance of State") + | |
theme_bw() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment