Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save RandomCriticalAnalysis/16ef913d12a98841d53d7fbd0e57abc6 to your computer and use it in GitHub Desktop.
Save RandomCriticalAnalysis/16ef913d12a98841d53d7fbd0e57abc6 to your computer and use it in GitHub Desktop.
Snippet to create correlation plot by state between school test scores and per pupil fiscal measures
library(dplyr)
library(reshape2)
library(ggplot2)
DataPath="." # you may need to update this if not working from same path
setwd(DataPath)
# Data from version 2.0 of covariates and pooled score estimates from SEDA
# https://cepa.stanford.edu/seda/data-archive
# https://stacks.stanford.edu/file/druid:db586ns4974/SEDA_geodist_pool_CS_v20.csv
sedaScores = read.csv("SEDA_geodist_pool_CS_v20.csv",stringsAsFactors = F)
# https://stacks.stanford.edu/file/druid:db586ns4974/SEDA_cov_geodist_pool_v20.csv
# (This one is pooled by across years and grades)
sedaCovars=read.csv("SEDA_cov_geodist_pool_v20.csv",stringsAsFactors = F)
# 1: merge scores and covariate data
# 2: retain only rows with mean for all students (not disaggregated by race/ethnicity)
# 3: duplicate some columns with easier to find name and some combinations
sedaMerged = merge(sedaScores,sedaCovars,by='leaidC') %>%
filter(
subgroup=='all'
) %>%
mutate(
score=mn_avg_ol,
expenditure_total = ppexp_tot,
expenditure_instructional = ppexp_inst,
revenue=pprev_tot,
state=stateabb.y,
enrollment=totenrl,
pct_white_or_asian =perasn+perwht,
pct_underrepresented_minority=perblk + perind + perhsp,
median_income=inc50all,
pct_ba_or_higher = baplus_all,
pct_single_mom=singmom_all,
SES=sesall,
pct_special_ed = perspeced,
median_income_10k = median_income / 10000
)
# I included some extra variables I'm not using in this brief example but have used for related analysis
# 'melt' selected expenditure measures so that we can analyze them quickly via grouping
sedaMelted = sedaMerged %>%
mutate(
log_income=log(inc50all),
`total expenditure per pupil`=expenditure_total,
`instructional expenditure per pupil`=expenditure_instructional,
`revenue per pupil`=revenue
) %>%
melt(
measure.vars=c(
'total expenditure per pupil',
'instructional expenditure per pupil',
'revenue per pupil'
)
)
# group by state and fiscal variables
sedaCors = sedaMelted %>%
group_by(state,variable) %>%
mutate(
n=n()
) %>%
filter(
n > 3 # exclude groups with fewer than 3 rows since this causes errors
# in this case, excludes groups with DC as a 'state' since it only 1 school district
) %>%
summarise(
cor=cor.test(value,score)$estimate,
cor_lo=cor.test(value,score)$conf.int[1],
cor_hi=cor.test(value,score)$conf.int[2],
enrollment=sum(enrollment,na.rm=T)
) %>%
ungroup()
sedaCors %>%
group_by(variable) %>%
mutate(
# create factor for state, ordered by mean correlation
state_lab=reorder(state,cor,mean,na.rm=T)
) %>%
ungroup() %>% {
ggplot(.,aes(cor,state_lab,color=variable)) +
geom_vline(xintercept=0) +
geom_point(size=3) +
geom_errorbarh(aes(xmin=cor_lo,xmax=cor_hi)) +
facet_wrap(~ variable,ncol=3) +
theme(legend.position='none') +
labs(
y=NULL,
x='Correlation coefficient between school $ measure and mean student test scores at district level'
) +
coord_cartesian(xlim=c(-.7,.7))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment