Last active
February 17, 2021 02:09
-
-
Save BioSciEconomist/501fd693c26d2b1796eda35929804cb9 to your computer and use it in GitHub Desktop.
Example state level synthetic controls
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: ex state level synthetic controls.R | |
# | DATE: 2/12/21 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: This code builds some intuition for how synthetic control methods work | |
# | using simulated data synthetic controls are created for a treatment state | |
# | placebo plots visualize uncertainty/rarity of the treatment | |
# *---------------------------------------------------------------- | |
# Code based on Synth: An R Package for Synthetic Control Methods in Comparative Case Studies | |
# Journal of Statistical Software June 2011, Volume 42, Issue 13 | |
# See also: https://cran.r-project.org/web/packages/Synth/Synth.pdf | |
# additionally following this simulation see example code from: https://mixtape.scunning.com/synthetic-control.html | |
rm(list=ls()) # get rid of any existing data | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
#------------------------------------------- | |
# simulate our state panel data | |
#------------------------------------------- | |
years <- c(rep(1990:1998,20)) | |
ID <- c(rep(1,9),rep(2,9),rep(3,9),rep(4,9),rep(5,9),rep(6,9),rep(7,9),rep(8,9), | |
rep(9,9),rep(10,9),rep(11,9),rep(12,9),rep(13,9),rep(14,9),rep(15,9), | |
rep(16,9),rep(17,9),rep(18,9),rep(19,9),rep(20,9)) | |
state <- c(rep("CA",9),rep("CO",9),rep("CT",9),rep("DW",9),rep("FL",9), | |
rep("TN",9),rep("KY",9),rep("GA",9),rep("NC",9),rep("SC",9),rep("LA",9),rep("TX",9), | |
rep("AR",9),rep("IL",9),rep("IN",9),rep("MO",9),rep("IA",9),rep("OK",9),rep("NE",9), | |
rep("AL",9)) | |
df <- data.frame(ID,state,years) | |
#------------------------------------ | |
# create some random covariates | |
#------------------------------------ | |
# unemployment | |
ky <- c(runif(9,.03,.05)) | |
ca <- c(runif(9,.02,.04)) | |
co <- c(runif(9,.03,.04)) | |
ct <- c(runif(9,.025,.03)) | |
dw <- c(runif(9,.03,.04)) | |
fl <- c(runif(9,.03,.035)) | |
tn <- c(runif(9,.03,.05)) | |
ga <- c(runif(9,.03,.05)) | |
nc <- c(runif(9,.03,.05)) | |
sc <- c(runif(9,.03,.05)) | |
la <- c(runif(9,.04,.055)) | |
tx <- c(runif(9,.02,.03)) | |
ar <- c(runif(9,.05,.06)) | |
il <- c(runif(9,.025,.03)) | |
ind <- c(runif(9,.03,.05)) | |
mo <- c(runif(9,.03,.035)) | |
ia <- c(runif(9,.03,.035)) | |
ok <- c(runif(9,.03,.05)) | |
ne <- c(runif(9,.03,.045)) | |
al <- c(runif(9,.04,.055)) | |
UN <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al) | |
# education % w/BS degree | |
ky <- c(runif(9,.21,.24)) | |
ca <- c(runif(9,.24,.26)) | |
co <- c(runif(9,.22,.25)) | |
ct <- c(runif(9,.28,.33)) | |
dw <- c(runif(9,.27,.30)) | |
fl <- c(runif(9,.23,.24)) | |
tn <- c(runif(9,.21,.24)) | |
ga <- c(runif(9,.21,.24)) | |
nc <- c(runif(9,.21,.24)) | |
sc <- c(runif(9,.21,.23)) | |
la <- c(runif(9,.13,.15)) | |
tx <- c(runif(9,.20,.23)) | |
ar <- c(runif(9,.15,.18)) | |
il <- c(runif(9,.27,.29)) | |
ind <- c(runif(9,.26,.28)) | |
mo <- c(runif(9,.18,.21)) | |
ia <- c(runif(9,.20,.21)) | |
ok <- c(runif(9,.21,.22)) | |
ne <- c(runif(9,.21,.22)) | |
al <- c(runif(9,.15,.17)) | |
college <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al) | |
# restuarants per capita | |
ky <- c(runif(9,4,5)) | |
ca <- c(runif(9,2,3)) | |
co <- c(runif(9,3,4)) | |
ct <- c(runif(9,2,3)) | |
dw <- c(runif(9,3,4)) | |
fl <- c(runif(9,5,6)) | |
tn <- c(runif(9,4,5)) | |
ga <- c(runif(9,4,5)) | |
nc <- c(runif(9,4,5)) | |
sc <- c(runif(9,3.5,4)) | |
la <- c(runif(9,6,7)) | |
tx <- c(runif(9,5,6)) | |
ar <- c(runif(9,3,4)) | |
il <- c(runif(9,2,3)) | |
ind <- c(runif(9,2,3)) | |
mo <- c(runif(9,4,5)) | |
ia <- c(runif(9,3,4)) | |
ok <- c(runif(9,2,3)) | |
ne <- c(runif(9,2,3)) | |
al <- c(runif(9,5,6)) | |
fastfood <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al) | |
df <- data.frame(ID,state,years,UN,college,fastfood) | |
# simulate cost outcome as a function of controls | |
df$cost <- (df$UN*.5 + df$college*.25 + .75*df$fastfood)*1000 | |
# modify KY cost representing an intervention from 1995-98 | |
df$cost <- ifelse(df$state =='KY' & df$years >= 1995,df$cost*.5,df$cost) | |
# modify a few states to look more like KY | |
df$cost<- ifelse(df$state == 'KY', df$cost, | |
ifelse(df$state == 'TN' & df$years < 1995,df$cost[df$state == 'KY'], | |
ifelse(df$state == 'GA' & df$years < 1995,df$cost[df$state == 'KY'], | |
ifelse(df$state == 'NC' & df$years < 1995,df$cost[df$state == 'KY'], | |
df$cost)))) | |
#---------------------------------------------- | |
# create synthetic controls for treatment (KY) | |
#---------------------------------------------- | |
### Methodology/Estimation | |
# Synthetic control methods involve the construction of synthetic control units as convex | |
# combinations of multiple control units. | |
# To construct our synthetic control unit we define a vector of weights W | |
# Each W then represents one particular weighted average of control units and therefore one potential synthetic control unit. | |
# To create the most similar synthetic control unit, the synth() function chooses the vector W∗ | |
# to minimize a distance, kX1 − X0Wk, between X1 and X0W, subject to the weight constraints. | |
# The V matrix is introduced to allow different weights to the variables in X0 and X1 depending | |
# on their predictive power on the outcome. An optimal choice of V assigns weights that | |
# minimize the mean square error of the synthetic control estimator, that is the expectation of | |
# (Y1 − Y0W∗)'(Y1 − Y0W∗) | |
# In this procedure a V∗ is chosen among all positive definite and diagonal matrices such that | |
# the mean squared prediction error (MSPE) of the outcome variable is minimized over some | |
# set of pre-intervention periods. | |
# IMPORTANT NOTES ABOUT THE REQ DATA STRUCTURE | |
# dataset is organized in standard (long) panel-data format, with variables extending | |
# across the columns and the rows sorted first by region and then by time-period.10 A name | |
# (character-string) and number is provided for each region.11 At least one of these two types | |
# of unit-identifiers is required for Synth | |
# The first step is to reorganize the panel dataset into an appropriate format that is suitable | |
# for the main estimator function synth(). At a minimum, synth() requires as inputs the four | |
# data matrices X1, X0, Z1, and Z0 that are needed to construct a synthetic control unit. | |
library(Synth) # load Synth package | |
df$state = as.character(df$state) # Synth requires our unit name variable to be character format | |
# run Synth's data prep function | |
dataprep.out= | |
dataprep( | |
foo = df, | |
predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units | |
predictors.op = "mean", # predictors will be aggregated via averaging | |
dependent = "cost", # our outcome variable | |
unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level) | |
time.variable = "years", # our panel data is tracked by year in our file | |
# special.predictors lets you control the time periods for certain predictor variables | |
# some measures are only available for certain years | |
# special.predictors = list( | |
# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable | |
# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable | |
# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable | |
# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable | |
# ), | |
treatment.identifier = 7, # indicates KY is our treatment group | |
controls.identifier = c(1:6,8:20), # these states are part of our control pool which will be weighted to create synthetic controls | |
time.predictors.prior = c(1990:1994), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged | |
time.optimize.ssr = c(1990:1994), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized | |
unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character. | |
time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted | |
) | |
print(dataprep.out$X0) # check | |
print(dataprep.out$X1) # check | |
print(dataprep.out$Z1) # KY outcomes for the pre-intervention period | |
print(dataprep.out$Z0) # control outcomes for the pre-intervention period | |
synth.out = synth(dataprep.out) # create synthetic controls | |
#----------------------------------- | |
# some descriptive results | |
#------------------------------------ | |
# pre built tables from synth objects | |
synth.tables <- synth.tab(dataprep.res = dataprep.out,synth.res = synth.out) | |
# comparing pre-treatment predictor values for the treated unit, the synthetic control unit, and all the units in the sample | |
synth.tables$tab.pred[1:3, ] # check balance across treated and control for pre-period predictors | |
# view control unit weights | |
synth.tables$tab.w | |
# calculate gaps or differencesin treatment and synthetic control | |
gaps <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w) | |
gaps[6:9, 1] # intervention years | |
gaps[1:5, 1] # pre period years | |
#--------------------------------- | |
# visualize outcome trend | |
#--------------------------------- | |
# plot treatment vs synthetic control outcomes trend | |
path.plot(synth.res = synth.out, dataprep.res = dataprep.out, | |
Ylab = "made up outcome", Xlab = "year", | |
Legend = c("KY","Synthetic KY"), | |
Legend.position = "bottomright") | |
#------------------------------------ | |
# visualize impact (gaps) | |
#------------------------------------ | |
# gap plot | |
gaps.plot(synth.res = synth.out, dataprep.res = dataprep.out, | |
Ylab = "costs", Xlab = "year", | |
Ylim = c(-2000, 1000), Main = NA) | |
#----------------------------------------------- | |
# inference via placebo plot | |
#------------------------------------------------ | |
# These tests involve applying the synthetic control method after reassigning the intervention in the | |
# data to units and periods where the intervention did not occur | |
# Abadie et al. (2010) describe how synthetic control methods facilitate inferential techniques | |
# akin to permutation tests-so-called placebo studies - iteratively apply the synthetic | |
# control method by randomly reassigning the intervention in time (i.e., pre-intervention dates) | |
# or across units (i.e., to control units where the intervention did not occur) to produce a set of | |
# placebo effects. Subsequently, we can compare the set of placebo effects to the effect that was | |
# estimated for the time and unit where the intervention actually occurred. This comparison | |
# is informative about the rarity of the magnitude of the treatment effect that was observed | |
# for the exposed unit. | |
# intialize matrices to stor results | |
store <- matrix(NA,length(1990:1998),20) # we will loop through all years and states | |
colnames(store) <- unique(df$state) | |
# run placebo test | |
for(iter in 1:20) | |
{ | |
dataprep.out= | |
dataprep( | |
foo = df, | |
predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units | |
predictors.op = "mean", # predictors will be aggregated via averaging | |
dependent = "cost", # our outcome variable | |
unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level) | |
time.variable = "years", # our panel data is tracked by year in our file | |
# special.predictors lets you control the time periods for certain predictor variables | |
# in addition to the predictors to create our synthetic control units (some variables may only have measures for certain years) | |
# special.predictors = list( | |
# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable | |
# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable | |
# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable | |
# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable | |
# ), | |
treatment.identifier = iter, # this is the iterated placebo treatment | |
controls.identifier = c(1:20)[-iter],# exclude this iteration's 'placebo treatment', remaining units will be controls | |
time.predictors.prior = c(1990:1995), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged | |
time.optimize.ssr = c(1990:1995), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized | |
unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character. | |
time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted | |
) | |
# run synth | |
synth.out <- synth( | |
data.prep.obj = dataprep.out, | |
method = "BFGS" | |
) | |
# store gaps | |
store[,iter] <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w) | |
} | |
#------------------------------------------ | |
# visualization | |
#------------------------------------------ | |
data <- store | |
rownames(data) <- 1990:1998 | |
# Set bounds in gaps data | |
gap.start <- 1 | |
gap.end <- nrow(data) | |
years <- 1990:1998 | |
gap.end.pre <- which(rownames(data)=="1994") | |
# MSPE Pre-Treatment | |
mse <- apply(data[gap.start:gap.end.pre,]^2,2,mean) | |
ky.mse <- as.numeric(mse[7]) # save MSPE for treatment | |
# Exclude states with 5 times higher MSPE than KY (treatment) (I have seen other multiples used like 2x) | |
data <- data[, mse < 5*ky.mse] | |
Cex.set <- .95 | |
# Plot | |
plot(years,data[gap.start:gap.end,which(colnames(data)=="KY")], | |
ylim=c(-2000,2000),xlab="year", | |
xlim=c(1990,1998),ylab="gap in costs", | |
type="l",lwd=2,col="black", | |
xaxs="i",yaxs="i") | |
# Add lines for control states | |
for (i in 1:ncol(data)) { lines(years,data[gap.start:gap.end,i],col="gray") } | |
## Add treatment Line | |
lines(years,data[gap.start:gap.end,which(colnames(data)=="KY")],lwd=2,col="black") | |
# Add grid | |
legend("bottomright",legend=c("treatment","control regions"), | |
lty=c(1,1),col=c("black","gray"),lwd=c(2,1),cex=.8) | |
#arrows(1995,-1.5,1995.5,-1.5,col="black",length=.1) | |
text(1994.75,-2.5,"Test Start",cex=Cex.set) | |
abline(v=1994) | |
abline(v=1998) | |
#----------------------------------- | |
# calculate exact p-values based on ranked mspe | |
#----------------------------------- | |
# extract and transform gaps data / convert rownames to years | |
tmp <- data.frame(data) | |
years <- rownames(tmp) | |
rownames(tmp) <- NULL | |
tmp <- cbind(years,tmp) | |
tmp$years <- as.numeric((tmp$years)) | |
head(tmp) | |
names(tmp) | |
# calculaute pre treatmnet mspe | |
pre <- tmp[tmp$years <= 1994,] | |
pre <- pre[,!(names(pre) %in% c("years"))] | |
mspe_pre <- apply(pre[1:5,]^2,2,mean) # this squares the gaps and averages for each state | |
# calculate post treatment mspe | |
post <- tmp[tmp$years > 1994,] | |
post <- post[,!(names(post) %in% c("years"))] | |
mspe_post <- apply(post[1:4,]^2,2,mean) # this squares the gaps and averages for each state | |
# calculate the ratio of ppst pre mspe | |
tmp2 <- data.frame(mspe_post / mspe_pre) | |
# transform | |
states <- rownames(tmp2) | |
rownames(tmp2) <- NULL | |
tmp2 <- cbind(states,tmp2) | |
head(tmp2) | |
names(tmp2) | |
#sort | |
tmp2 <- tmp2[order(-tmp2$mspe_post.mspe_pre),] | |
tmp2$rank <- seq(1:NROW(tmp2)) | |
# calculate exact p-value based on our treatment unit's ranked mspe | |
tmp2$rank[tmp2$states == "KY"]/NROW(tmp2) | |
# | |
# use SCTools to calculate p-value from randomization inference | |
# | |
#!!!!!!!!!!!!! NOTE: run this after original data prep function not the placebo runs above | |
## Test how extreme was the observed treatment effect given the placebos: | |
library(SCtools) | |
tdf <- generate.placebos(dataprep.out,synth.out, Sigf.ipop = 2) | |
ratio <- mspe.test(tdf) | |
ratio$p.val | |
mspe.plot(tdf, discard.extreme = TRUE) | |
#--------------------------------------------------------- | |
# synthtetic controls example from: https://mixtape.scunning.com/synthetic-control.html | |
#--------------------------------------------------------- | |
library(tidyverse) | |
library(haven) | |
library(Synth) | |
library(devtools) | |
if(!require(SCtools)) devtools::install_github("bcastanho/SCtools") | |
library(SCtools) | |
read_data <- function(df) | |
{ | |
full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", | |
df, sep = "") | |
df <- read_dta(full_path) | |
return(df) | |
} | |
texas <- read_data("texas.dta") %>% | |
as.data.frame(.) | |
dataprep_out <- dataprep( | |
foo = texas, | |
predictors = c("poverty", "income"), | |
predictors.op = "mean", | |
time.predictors.prior = 1985:1993, | |
special.predictors = list( | |
list("bmprison", c(1988, 1990:1992), "mean"), | |
list("alcohol", 1990, "mean"), | |
list("aidscapita", 1990:1991, "mean"), | |
list("black", 1990:1992, "mean"), | |
list("perc1519", 1990, "mean")), | |
dependent = "bmprison", | |
unit.variable = "statefip", | |
unit.names.variable = "state", | |
time.variable = "year", | |
treatment.identifier = 48, | |
controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56), | |
time.optimize.ssr = 1985:1993, | |
time.plot = 1985:2000 | |
) | |
synth_out <- synth(data.prep.obj = dataprep_out) | |
# | |
# visualization - trend and gaps | |
# | |
path.plot(synth_out, dataprep_out) | |
gaps.plot(synth_out, dataprep_out) | |
# | |
# visualization - placebo plots | |
# | |
placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3) | |
plot_placebos(placebos) | |
mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment