Created
May 7, 2018 14:36
-
-
Save RandomCriticalAnalysis/16ef913d12a98841d53d7fbd0e57abc6 to your computer and use it in GitHub Desktop.
Snippet to create correlation plot by state between school test scores and per pupil fiscal measures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(reshape2) | |
library(ggplot2) | |
DataPath="." # you may need to update this if not working from same path | |
setwd(DataPath) | |
# Data from version 2.0 of covariates and pooled score estimates from SEDA | |
# https://cepa.stanford.edu/seda/data-archive | |
# https://stacks.stanford.edu/file/druid:db586ns4974/SEDA_geodist_pool_CS_v20.csv | |
sedaScores = read.csv("SEDA_geodist_pool_CS_v20.csv",stringsAsFactors = F) | |
# https://stacks.stanford.edu/file/druid:db586ns4974/SEDA_cov_geodist_pool_v20.csv | |
# (This one is pooled by across years and grades) | |
sedaCovars=read.csv("SEDA_cov_geodist_pool_v20.csv",stringsAsFactors = F) | |
# 1: merge scores and covariate data | |
# 2: retain only rows with mean for all students (not disaggregated by race/ethnicity) | |
# 3: duplicate some columns with easier to find name and some combinations | |
sedaMerged = merge(sedaScores,sedaCovars,by='leaidC') %>% | |
filter( | |
subgroup=='all' | |
) %>% | |
mutate( | |
score=mn_avg_ol, | |
expenditure_total = ppexp_tot, | |
expenditure_instructional = ppexp_inst, | |
revenue=pprev_tot, | |
state=stateabb.y, | |
enrollment=totenrl, | |
pct_white_or_asian =perasn+perwht, | |
pct_underrepresented_minority=perblk + perind + perhsp, | |
median_income=inc50all, | |
pct_ba_or_higher = baplus_all, | |
pct_single_mom=singmom_all, | |
SES=sesall, | |
pct_special_ed = perspeced, | |
median_income_10k = median_income / 10000 | |
) | |
# I included some extra variables I'm not using in this brief example but have used for related analysis | |
# 'melt' selected expenditure measures so that we can analyze them quickly via grouping | |
sedaMelted = sedaMerged %>% | |
mutate( | |
log_income=log(inc50all), | |
`total expenditure per pupil`=expenditure_total, | |
`instructional expenditure per pupil`=expenditure_instructional, | |
`revenue per pupil`=revenue | |
) %>% | |
melt( | |
measure.vars=c( | |
'total expenditure per pupil', | |
'instructional expenditure per pupil', | |
'revenue per pupil' | |
) | |
) | |
# group by state and fiscal variables | |
sedaCors = sedaMelted %>% | |
group_by(state,variable) %>% | |
mutate( | |
n=n() | |
) %>% | |
filter( | |
n > 3 # exclude groups with fewer than 3 rows since this causes errors | |
# in this case, excludes groups with DC as a 'state' since it only 1 school district | |
) %>% | |
summarise( | |
cor=cor.test(value,score)$estimate, | |
cor_lo=cor.test(value,score)$conf.int[1], | |
cor_hi=cor.test(value,score)$conf.int[2], | |
enrollment=sum(enrollment,na.rm=T) | |
) %>% | |
ungroup() | |
sedaCors %>% | |
group_by(variable) %>% | |
mutate( | |
# create factor for state, ordered by mean correlation | |
state_lab=reorder(state,cor,mean,na.rm=T) | |
) %>% | |
ungroup() %>% { | |
ggplot(.,aes(cor,state_lab,color=variable)) + | |
geom_vline(xintercept=0) + | |
geom_point(size=3) + | |
geom_errorbarh(aes(xmin=cor_lo,xmax=cor_hi)) + | |
facet_wrap(~ variable,ncol=3) + | |
theme(legend.position='none') + | |
labs( | |
y=NULL, | |
x='Correlation coefficient between school $ measure and mean student test scores at district level' | |
) + | |
coord_cartesian(xlim=c(-.7,.7)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment