Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active February 17, 2021 02:09
Show Gist options
  • Save BioSciEconomist/501fd693c26d2b1796eda35929804cb9 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/501fd693c26d2b1796eda35929804cb9 to your computer and use it in GitHub Desktop.
Example state level synthetic controls
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex state level synthetic controls.R
# | DATE: 2/12/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: This code builds some intuition for how synthetic control methods work
# | using simulated data synthetic controls are created for a treatment state
# | placebo plots visualize uncertainty/rarity of the treatment
# *----------------------------------------------------------------
# Code based on Synth: An R Package for Synthetic Control Methods in Comparative Case Studies
# Journal of Statistical Software June 2011, Volume 42, Issue 13
# See also: https://cran.r-project.org/web/packages/Synth/Synth.pdf
# additionally following this simulation see example code from: https://mixtape.scunning.com/synthetic-control.html
rm(list=ls()) # get rid of any existing data
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
#-------------------------------------------
# simulate our state panel data
#-------------------------------------------
years <- c(rep(1990:1998,20))
ID <- c(rep(1,9),rep(2,9),rep(3,9),rep(4,9),rep(5,9),rep(6,9),rep(7,9),rep(8,9),
rep(9,9),rep(10,9),rep(11,9),rep(12,9),rep(13,9),rep(14,9),rep(15,9),
rep(16,9),rep(17,9),rep(18,9),rep(19,9),rep(20,9))
state <- c(rep("CA",9),rep("CO",9),rep("CT",9),rep("DW",9),rep("FL",9),
rep("TN",9),rep("KY",9),rep("GA",9),rep("NC",9),rep("SC",9),rep("LA",9),rep("TX",9),
rep("AR",9),rep("IL",9),rep("IN",9),rep("MO",9),rep("IA",9),rep("OK",9),rep("NE",9),
rep("AL",9))
df <- data.frame(ID,state,years)
#------------------------------------
# create some random covariates
#------------------------------------
# unemployment
ky <- c(runif(9,.03,.05))
ca <- c(runif(9,.02,.04))
co <- c(runif(9,.03,.04))
ct <- c(runif(9,.025,.03))
dw <- c(runif(9,.03,.04))
fl <- c(runif(9,.03,.035))
tn <- c(runif(9,.03,.05))
ga <- c(runif(9,.03,.05))
nc <- c(runif(9,.03,.05))
sc <- c(runif(9,.03,.05))
la <- c(runif(9,.04,.055))
tx <- c(runif(9,.02,.03))
ar <- c(runif(9,.05,.06))
il <- c(runif(9,.025,.03))
ind <- c(runif(9,.03,.05))
mo <- c(runif(9,.03,.035))
ia <- c(runif(9,.03,.035))
ok <- c(runif(9,.03,.05))
ne <- c(runif(9,.03,.045))
al <- c(runif(9,.04,.055))
UN <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)
# education % w/BS degree
ky <- c(runif(9,.21,.24))
ca <- c(runif(9,.24,.26))
co <- c(runif(9,.22,.25))
ct <- c(runif(9,.28,.33))
dw <- c(runif(9,.27,.30))
fl <- c(runif(9,.23,.24))
tn <- c(runif(9,.21,.24))
ga <- c(runif(9,.21,.24))
nc <- c(runif(9,.21,.24))
sc <- c(runif(9,.21,.23))
la <- c(runif(9,.13,.15))
tx <- c(runif(9,.20,.23))
ar <- c(runif(9,.15,.18))
il <- c(runif(9,.27,.29))
ind <- c(runif(9,.26,.28))
mo <- c(runif(9,.18,.21))
ia <- c(runif(9,.20,.21))
ok <- c(runif(9,.21,.22))
ne <- c(runif(9,.21,.22))
al <- c(runif(9,.15,.17))
college <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)
# restuarants per capita
ky <- c(runif(9,4,5))
ca <- c(runif(9,2,3))
co <- c(runif(9,3,4))
ct <- c(runif(9,2,3))
dw <- c(runif(9,3,4))
fl <- c(runif(9,5,6))
tn <- c(runif(9,4,5))
ga <- c(runif(9,4,5))
nc <- c(runif(9,4,5))
sc <- c(runif(9,3.5,4))
la <- c(runif(9,6,7))
tx <- c(runif(9,5,6))
ar <- c(runif(9,3,4))
il <- c(runif(9,2,3))
ind <- c(runif(9,2,3))
mo <- c(runif(9,4,5))
ia <- c(runif(9,3,4))
ok <- c(runif(9,2,3))
ne <- c(runif(9,2,3))
al <- c(runif(9,5,6))
fastfood <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)
df <- data.frame(ID,state,years,UN,college,fastfood)
# simulate cost outcome as a function of controls
df$cost <- (df$UN*.5 + df$college*.25 + .75*df$fastfood)*1000
# modify KY cost representing an intervention from 1995-98
df$cost <- ifelse(df$state =='KY' & df$years >= 1995,df$cost*.5,df$cost)
# modify a few states to look more like KY
df$cost<- ifelse(df$state == 'KY', df$cost,
ifelse(df$state == 'TN' & df$years < 1995,df$cost[df$state == 'KY'],
ifelse(df$state == 'GA' & df$years < 1995,df$cost[df$state == 'KY'],
ifelse(df$state == 'NC' & df$years < 1995,df$cost[df$state == 'KY'],
df$cost))))
#----------------------------------------------
# create synthetic controls for treatment (KY)
#----------------------------------------------
### Methodology/Estimation
# Synthetic control methods involve the construction of synthetic control units as convex
# combinations of multiple control units.
# To construct our synthetic control unit we define a vector of weights W
# Each W then represents one particular weighted average of control units and therefore one potential synthetic control unit.
# To create the most similar synthetic control unit, the synth() function chooses the vector W∗
# to minimize a distance, kX1 − X0Wk, between X1 and X0W, subject to the weight constraints.
# The V matrix is introduced to allow different weights to the variables in X0 and X1 depending
# on their predictive power on the outcome. An optimal choice of V assigns weights that
# minimize the mean square error of the synthetic control estimator, that is the expectation of
# (Y1 − Y0W∗)'(Y1 − Y0W∗)
# In this procedure a V∗ is chosen among all positive definite and diagonal matrices such that
# the mean squared prediction error (MSPE) of the outcome variable is minimized over some
# set of pre-intervention periods.
# IMPORTANT NOTES ABOUT THE REQ DATA STRUCTURE
# dataset is organized in standard (long) panel-data format, with variables extending
# across the columns and the rows sorted first by region and then by time-period.10 A name
# (character-string) and number is provided for each region.11 At least one of these two types
# of unit-identifiers is required for Synth
# The first step is to reorganize the panel dataset into an appropriate format that is suitable
# for the main estimator function synth(). At a minimum, synth() requires as inputs the four
# data matrices X1, X0, Z1, and Z0 that are needed to construct a synthetic control unit.
library(Synth) # load Synth package
df$state = as.character(df$state) # Synth requires our unit name variable to be character format
# run Synth's data prep function
dataprep.out=
dataprep(
foo = df,
predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
predictors.op = "mean", # predictors will be aggregated via averaging
dependent = "cost", # our outcome variable
unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
time.variable = "years", # our panel data is tracked by year in our file
# special.predictors lets you control the time periods for certain predictor variables
# some measures are only available for certain years
# special.predictors = list(
# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable
# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable
# ),
treatment.identifier = 7, # indicates KY is our treatment group
controls.identifier = c(1:6,8:20), # these states are part of our control pool which will be weighted to create synthetic controls
time.predictors.prior = c(1990:1994), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
time.optimize.ssr = c(1990:1994), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character.
time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
)
print(dataprep.out$X0) # check
print(dataprep.out$X1) # check
print(dataprep.out$Z1) # KY outcomes for the pre-intervention period
print(dataprep.out$Z0) # control outcomes for the pre-intervention period
synth.out = synth(dataprep.out) # create synthetic controls
#-----------------------------------
# some descriptive results
#------------------------------------
# pre built tables from synth objects
synth.tables <- synth.tab(dataprep.res = dataprep.out,synth.res = synth.out)
# comparing pre-treatment predictor values for the treated unit, the synthetic control unit, and all the units in the sample
synth.tables$tab.pred[1:3, ] # check balance across treated and control for pre-period predictors
# view control unit weights
synth.tables$tab.w
# calculate gaps or differencesin treatment and synthetic control
gaps <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)
gaps[6:9, 1] # intervention years
gaps[1:5, 1] # pre period years
#---------------------------------
# visualize outcome trend
#---------------------------------
# plot treatment vs synthetic control outcomes trend
path.plot(synth.res = synth.out, dataprep.res = dataprep.out,
Ylab = "made up outcome", Xlab = "year",
Legend = c("KY","Synthetic KY"),
Legend.position = "bottomright")
#------------------------------------
# visualize impact (gaps)
#------------------------------------
# gap plot
gaps.plot(synth.res = synth.out, dataprep.res = dataprep.out,
Ylab = "costs", Xlab = "year",
Ylim = c(-2000, 1000), Main = NA)
#-----------------------------------------------
# inference via placebo plot
#------------------------------------------------
# These tests involve applying the synthetic control method after reassigning the intervention in the
# data to units and periods where the intervention did not occur
# Abadie et al. (2010) describe how synthetic control methods facilitate inferential techniques
# akin to permutation tests-so-called placebo studies - iteratively apply the synthetic
# control method by randomly reassigning the intervention in time (i.e., pre-intervention dates)
# or across units (i.e., to control units where the intervention did not occur) to produce a set of
# placebo effects. Subsequently, we can compare the set of placebo effects to the effect that was
# estimated for the time and unit where the intervention actually occurred. This comparison
# is informative about the rarity of the magnitude of the treatment effect that was observed
# for the exposed unit.
# intialize matrices to stor results
store <- matrix(NA,length(1990:1998),20) # we will loop through all years and states
colnames(store) <- unique(df$state)
# run placebo test
for(iter in 1:20)
{
dataprep.out=
dataprep(
foo = df,
predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
predictors.op = "mean", # predictors will be aggregated via averaging
dependent = "cost", # our outcome variable
unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
time.variable = "years", # our panel data is tracked by year in our file
# special.predictors lets you control the time periods for certain predictor variables
# in addition to the predictors to create our synthetic control units (some variables may only have measures for certain years)
# special.predictors = list(
# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable
# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable
# ),
treatment.identifier = iter, # this is the iterated placebo treatment
controls.identifier = c(1:20)[-iter],# exclude this iteration's 'placebo treatment', remaining units will be controls
time.predictors.prior = c(1990:1995), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
time.optimize.ssr = c(1990:1995), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character.
time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
)
# run synth
synth.out <- synth(
data.prep.obj = dataprep.out,
method = "BFGS"
)
# store gaps
store[,iter] <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)
}
#------------------------------------------
# visualization
#------------------------------------------
data <- store
rownames(data) <- 1990:1998
# Set bounds in gaps data
gap.start <- 1
gap.end <- nrow(data)
years <- 1990:1998
gap.end.pre <- which(rownames(data)=="1994")
# MSPE Pre-Treatment
mse <- apply(data[gap.start:gap.end.pre,]^2,2,mean)
ky.mse <- as.numeric(mse[7]) # save MSPE for treatment
# Exclude states with 5 times higher MSPE than KY (treatment) (I have seen other multiples used like 2x)
data <- data[, mse < 5*ky.mse]
Cex.set <- .95
# Plot
plot(years,data[gap.start:gap.end,which(colnames(data)=="KY")],
ylim=c(-2000,2000),xlab="year",
xlim=c(1990,1998),ylab="gap in costs",
type="l",lwd=2,col="black",
xaxs="i",yaxs="i")
# Add lines for control states
for (i in 1:ncol(data)) { lines(years,data[gap.start:gap.end,i],col="gray") }
## Add treatment Line
lines(years,data[gap.start:gap.end,which(colnames(data)=="KY")],lwd=2,col="black")
# Add grid
legend("bottomright",legend=c("treatment","control regions"),
lty=c(1,1),col=c("black","gray"),lwd=c(2,1),cex=.8)
#arrows(1995,-1.5,1995.5,-1.5,col="black",length=.1)
text(1994.75,-2.5,"Test Start",cex=Cex.set)
abline(v=1994)
abline(v=1998)
#-----------------------------------
# calculate exact p-values based on ranked mspe
#-----------------------------------
# extract and transform gaps data / convert rownames to years
tmp <- data.frame(data)
years <- rownames(tmp)
rownames(tmp) <- NULL
tmp <- cbind(years,tmp)
tmp$years <- as.numeric((tmp$years))
head(tmp)
names(tmp)
# calculaute pre treatmnet mspe
pre <- tmp[tmp$years <= 1994,]
pre <- pre[,!(names(pre) %in% c("years"))]
mspe_pre <- apply(pre[1:5,]^2,2,mean) # this squares the gaps and averages for each state
# calculate post treatment mspe
post <- tmp[tmp$years > 1994,]
post <- post[,!(names(post) %in% c("years"))]
mspe_post <- apply(post[1:4,]^2,2,mean) # this squares the gaps and averages for each state
# calculate the ratio of ppst pre mspe
tmp2 <- data.frame(mspe_post / mspe_pre)
# transform
states <- rownames(tmp2)
rownames(tmp2) <- NULL
tmp2 <- cbind(states,tmp2)
head(tmp2)
names(tmp2)
#sort
tmp2 <- tmp2[order(-tmp2$mspe_post.mspe_pre),]
tmp2$rank <- seq(1:NROW(tmp2))
# calculate exact p-value based on our treatment unit's ranked mspe
tmp2$rank[tmp2$states == "KY"]/NROW(tmp2)
#
# use SCTools to calculate p-value from randomization inference
#
#!!!!!!!!!!!!! NOTE: run this after original data prep function not the placebo runs above
## Test how extreme was the observed treatment effect given the placebos:
library(SCtools)
tdf <- generate.placebos(dataprep.out,synth.out, Sigf.ipop = 2)
ratio <- mspe.test(tdf)
ratio$p.val
mspe.plot(tdf, discard.extreme = TRUE)
#---------------------------------------------------------
# synthtetic controls example from: https://mixtape.scunning.com/synthetic-control.html
#---------------------------------------------------------
library(tidyverse)
library(haven)
library(Synth)
library(devtools)
if(!require(SCtools)) devtools::install_github("bcastanho/SCtools")
library(SCtools)
read_data <- function(df)
{
full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/",
df, sep = "")
df <- read_dta(full_path)
return(df)
}
texas <- read_data("texas.dta") %>%
as.data.frame(.)
dataprep_out <- dataprep(
foo = texas,
predictors = c("poverty", "income"),
predictors.op = "mean",
time.predictors.prior = 1985:1993,
special.predictors = list(
list("bmprison", c(1988, 1990:1992), "mean"),
list("alcohol", 1990, "mean"),
list("aidscapita", 1990:1991, "mean"),
list("black", 1990:1992, "mean"),
list("perc1519", 1990, "mean")),
dependent = "bmprison",
unit.variable = "statefip",
unit.names.variable = "state",
time.variable = "year",
treatment.identifier = 48,
controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
time.optimize.ssr = 1985:1993,
time.plot = 1985:2000
)
synth_out <- synth(data.prep.obj = dataprep_out)
#
# visualization - trend and gaps
#
path.plot(synth_out, dataprep_out)
gaps.plot(synth_out, dataprep_out)
#
# visualization - placebo plots
#
placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)
plot_placebos(placebos)
mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment