BioSciEconomist · February 17, 2021 02:09
diff --git a/ex state level synthetic controls.R b/ex state level synthetic controls.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex state level synthetic controls.R
 # | DATE: 2/12/21 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:            
 # *----------------------------------------------------------------
 # | PURPOSE: This code builds some intuition for how synthetic control methods work
 # | using simulated data synthetic controls are created for a treatment state
 # | placebo plots visualize uncertainty/rarity of the treatment 
 # *----------------------------------------------------------------

 # Code based on Synth: An R Package for Synthetic Control Methods in Comparative Case Studies
 # Journal of Statistical Software June 2011, Volume 42, Issue 13

 # See also: https://cran.r-project.org/web/packages/Synth/Synth.pdf

 # additionally following this simulation see example code from: https://mixtape.scunning.com/synthetic-control.html

 rm(list=ls()) # get rid of any existing data 

 options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

 #-------------------------------------------
 # simulate our state panel data
 #-------------------------------------------

 years <- c(rep(1990:1998,20))

 ID <- c(rep(1,9),rep(2,9),rep(3,9),rep(4,9),rep(5,9),rep(6,9),rep(7,9),rep(8,9),
        rep(9,9),rep(10,9),rep(11,9),rep(12,9),rep(13,9),rep(14,9),rep(15,9),
        rep(16,9),rep(17,9),rep(18,9),rep(19,9),rep(20,9))
 state <- c(rep("CA",9),rep("CO",9),rep("CT",9),rep("DW",9),rep("FL",9),
           rep("TN",9),rep("KY",9),rep("GA",9),rep("NC",9),rep("SC",9),rep("LA",9),rep("TX",9),
           rep("AR",9),rep("IL",9),rep("IN",9),rep("MO",9),rep("IA",9),rep("OK",9),rep("NE",9),
           rep("AL",9))
 df <- data.frame(ID,state,years)

 #------------------------------------
 # create some random covariates
 #------------------------------------

 # unemployment 

 ky <- c(runif(9,.03,.05)) 
 ca <- c(runif(9,.02,.04))
 co <- c(runif(9,.03,.04))
 ct <- c(runif(9,.025,.03)) 
 dw <- c(runif(9,.03,.04)) 
 fl <- c(runif(9,.03,.035)) 
 tn <- c(runif(9,.03,.05)) 
 ga <- c(runif(9,.03,.05)) 
 nc <- c(runif(9,.03,.05)) 
 sc <- c(runif(9,.03,.05)) 
 la <- c(runif(9,.04,.055)) 
 tx <- c(runif(9,.02,.03)) 
 ar <- c(runif(9,.05,.06)) 
 il <- c(runif(9,.025,.03)) 
 ind <- c(runif(9,.03,.05)) 
 mo <- c(runif(9,.03,.035)) 
 ia <- c(runif(9,.03,.035)) 
 ok <- c(runif(9,.03,.05)) 
 ne <- c(runif(9,.03,.045)) 
 al <- c(runif(9,.04,.055)) 

 UN <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

 # education % w/BS degree

 ky <- c(runif(9,.21,.24)) 
 ca <- c(runif(9,.24,.26))
 co <- c(runif(9,.22,.25))
 ct <- c(runif(9,.28,.33)) 
 dw <- c(runif(9,.27,.30)) 
 fl <- c(runif(9,.23,.24)) 
 tn <- c(runif(9,.21,.24)) 
 ga <- c(runif(9,.21,.24)) 
 nc <- c(runif(9,.21,.24)) 
 sc <- c(runif(9,.21,.23)) 
 la <- c(runif(9,.13,.15)) 
 tx <- c(runif(9,.20,.23)) 
 ar <- c(runif(9,.15,.18)) 
 il <- c(runif(9,.27,.29)) 
 ind <- c(runif(9,.26,.28)) 
 mo <- c(runif(9,.18,.21)) 
 ia <- c(runif(9,.20,.21)) 
 ok <- c(runif(9,.21,.22)) 
 ne <- c(runif(9,.21,.22)) 
 al <- c(runif(9,.15,.17)) 

 college <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

 # restuarants per capita

 ky <- c(runif(9,4,5)) 
 ca <- c(runif(9,2,3))
 co <- c(runif(9,3,4))
 ct <- c(runif(9,2,3)) 
 dw <- c(runif(9,3,4)) 
 fl <- c(runif(9,5,6)) 
 tn <- c(runif(9,4,5)) 
 ga <- c(runif(9,4,5)) 
 nc <- c(runif(9,4,5)) 
 sc <- c(runif(9,3.5,4)) 
 la <- c(runif(9,6,7)) 
 tx <- c(runif(9,5,6)) 
 ar <- c(runif(9,3,4)) 
 il <- c(runif(9,2,3)) 
 ind <- c(runif(9,2,3)) 
 mo <- c(runif(9,4,5)) 
 ia <- c(runif(9,3,4)) 
 ok <- c(runif(9,2,3)) 
 ne <- c(runif(9,2,3)) 
 al <- c(runif(9,5,6)) 

 fastfood <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

 df <- data.frame(ID,state,years,UN,college,fastfood)

 # simulate cost outcome as a function of controls
 df$cost <- (df$UN*.5 + df$college*.25 + .75*df$fastfood)*1000

 # modify KY cost representing an intervention from 1995-98
 df$cost <- ifelse(df$state =='KY' & df$years >= 1995,df$cost*.5,df$cost)

 # modify a few states to look more like KY
 df$cost<- ifelse(df$state == 'KY', df$cost,
                     ifelse(df$state == 'TN' & df$years < 1995,df$cost[df$state == 'KY'],
                            ifelse(df$state == 'GA' & df$years < 1995,df$cost[df$state == 'KY'],
                                   ifelse(df$state == 'NC' & df$years < 1995,df$cost[df$state == 'KY'],
                                    df$cost))))


 #----------------------------------------------
 # create synthetic controls for treatment (KY)
 #----------------------------------------------


 ### Methodology/Estimation

 # Synthetic control methods involve the construction of synthetic control units as convex 
 # combinations of multiple control units.

 #  To construct our synthetic control unit we define a vector of weights W
 #  Each W then represents one particular weighted average of control units and therefore one potential synthetic control unit.

 # To create the most similar synthetic control unit, the synth() function chooses the vector W∗
 # to minimize a distance, kX1 − X0Wk, between X1 and X0W, subject to the weight constraints.

 # The V matrix is introduced to allow different weights to the variables in X0 and X1 depending
 # on their predictive power on the outcome. An optimal choice of V assigns weights that
 # minimize the mean square error of the synthetic control estimator, that is the expectation of
 # (Y1 − Y0W∗)'(Y1 − Y0W∗)

 # In this procedure a V∗ is chosen among all positive definite and diagonal matrices such that
 # the mean squared prediction error (MSPE) of the outcome variable is minimized over some
 # set of pre-intervention periods.

 # IMPORTANT NOTES ABOUT THE REQ DATA STRUCTURE

 # dataset is organized in standard (long) panel-data format, with variables extending
 # across the columns and the rows sorted first by region and then by time-period.10 A name
 # (character-string) and number is provided for each region.11 At least one of these two types
 # of unit-identifiers is required for Synth

 # The first step is to reorganize the panel dataset into an appropriate format that is suitable
 # for the main estimator function synth(). At a minimum, synth() requires as inputs the four
 # data matrices X1, X0, Z1, and Z0 that are needed to construct a synthetic control unit. 


 library(Synth) # load Synth package

 df$state = as.character(df$state) # Synth requires our unit name variable to be character format 

 # run Synth's data prep function
 dataprep.out=
  dataprep(
    foo = df,
    predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
    predictors.op = "mean",  # predictors will be aggregated via averaging
    dependent = "cost", # our outcome variable
    unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
    time.variable = "years", # our panel data is tracked by year in our file
    # special.predictors lets you control the time periods for certain predictor variables
    # some measures are only available for certain years 
    # special.predictors = list(
    #   list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
    #   list("UN", 1990:1995, "mean") ,  # use theese year values for X1 in the pre period as a 'matching' variable
    #   list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
    #   list("fastfood", 1990:1995, "mean")  # use theese year values for X3 in the pre period as a 'matching' variable
    # ),
    treatment.identifier = 7, # indicates KY is our treatment group
    controls.identifier = c(1:6,8:20), # these states are part of our control pool which will be weighted to create synthetic controls
    time.predictors.prior = c(1990:1994), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
    time.optimize.ssr = c(1990:1994), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
    unit.names.variable = "state", #  character string identifying the column with the names of the units. This variable has to be of mode character.
    time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
  )


 print(dataprep.out$X0) # check 
 print(dataprep.out$X1) # check 
 print(dataprep.out$Z1) # KY outcomes for the pre-intervention period
 print(dataprep.out$Z0) # control outcomes for the pre-intervention period

 synth.out = synth(dataprep.out) # create synthetic controls 

 #-----------------------------------
 # some descriptive results
 #------------------------------------


 # pre built tables from synth objects
 synth.tables <- synth.tab(dataprep.res = dataprep.out,synth.res = synth.out)

 # comparing pre-treatment predictor values for the treated unit, the synthetic control unit, and all the units in the sample
 synth.tables$tab.pred[1:3, ] # check balance across treated and control for pre-period predictors

 # view control unit weights
 synth.tables$tab.w

 # calculate gaps or differencesin treatment and synthetic control
 gaps <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)

 gaps[6:9, 1] # intervention years
 gaps[1:5, 1] # pre period years

 #---------------------------------
 # visualize outcome trend
 #---------------------------------

 # plot treatment vs synthetic control outcomes trend
 path.plot(synth.res = synth.out, dataprep.res = dataprep.out,
          Ylab = "made up outcome", Xlab = "year",
          Legend = c("KY","Synthetic KY"), 
          Legend.position = "bottomright")


 #------------------------------------
 # visualize impact (gaps)
 #------------------------------------

 # gap plot
 gaps.plot(synth.res = synth.out, dataprep.res = dataprep.out,
          Ylab = "costs", Xlab = "year",
          Ylim = c(-2000, 1000), Main = NA)


 #-----------------------------------------------
 # inference via placebo plot
 #------------------------------------------------

 # These tests involve applying the synthetic control method after reassigning the intervention in the
 # data to units and periods where the intervention did not occur


 # Abadie et al. (2010) describe how synthetic control methods facilitate inferential techniques
 # akin to permutation tests-so-called placebo studies - iteratively apply the synthetic
 # control method by randomly reassigning the intervention in time (i.e., pre-intervention dates)
 # or across units (i.e., to control units where the intervention did not occur) to produce a set of
 # placebo effects. Subsequently, we can compare the set of placebo effects to the effect that was
 # estimated for the time and unit where the intervention actually occurred. This comparison
 # is informative about the rarity of the magnitude of the treatment effect that was observed
 # for the exposed unit.

 # intialize matrices to stor results
 store <- matrix(NA,length(1990:1998),20) # we will loop through all years and states
 colnames(store) <- unique(df$state)


 # run placebo test
 for(iter in 1:20)
 {
  dataprep.out=
    dataprep(
      foo = df,
      predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
      predictors.op = "mean",  # predictors will be aggregated via averaging
      dependent = "cost", # our outcome variable
      unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
      time.variable = "years", # our panel data is tracked by year in our file
      # special.predictors lets you control the time periods for certain predictor variables
      # in addition to the predictors to create our synthetic control units (some variables may only have measures for certain years)
      # special.predictors = list(
      #   list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
      #   list("UN", 1990:1995, "mean") ,  # use theese year values for X1 in the pre period as a 'matching' variable
      #   list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
      #   list("fastfood", 1990:1995, "mean")  # use theese year values for X3 in the pre period as a 'matching' variable
      # ),
      treatment.identifier = iter, # this is the iterated placebo treatment 
      controls.identifier = c(1:20)[-iter],# exclude this iteration's 'placebo treatment', remaining units will be controls
      time.predictors.prior = c(1990:1995), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
      time.optimize.ssr = c(1990:1995), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
      unit.names.variable = "state", #  character string identifying the column with the names of the units. This variable has to be of mode character.
      time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
    )
  # run synth
  synth.out <- synth(
    data.prep.obj = dataprep.out,
    method = "BFGS"
  )
  
  # store gaps
  store[,iter] <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)
 }

 #------------------------------------------
 # visualization 
 #------------------------------------------

 data <- store
 rownames(data) <- 1990:1998

 # Set bounds in gaps data
 gap.start     <- 1
 gap.end       <- nrow(data)
 years         <- 1990:1998
 gap.end.pre  <- which(rownames(data)=="1994")

 #  MSPE Pre-Treatment
 mse <- apply(data[gap.start:gap.end.pre,]^2,2,mean)
 ky.mse <- as.numeric(mse[7]) # save MSPE for treatment 

 # Exclude states with 5 times higher MSPE than KY (treatment) (I have seen other multiples used like 2x)
 data <- data[, mse < 5*ky.mse]
 Cex.set <- .95

 # Plot
 plot(years,data[gap.start:gap.end,which(colnames(data)=="KY")],
     ylim=c(-2000,2000),xlab="year",
     xlim=c(1990,1998),ylab="gap in costs",
     type="l",lwd=2,col="black",
     xaxs="i",yaxs="i")

 # Add lines for control states
 for (i in 1:ncol(data)) { lines(years,data[gap.start:gap.end,i],col="gray") }

 ## Add treatment Line
 lines(years,data[gap.start:gap.end,which(colnames(data)=="KY")],lwd=2,col="black")

 # Add grid

 legend("bottomright",legend=c("treatment","control regions"),
       lty=c(1,1),col=c("black","gray"),lwd=c(2,1),cex=.8)
 #arrows(1995,-1.5,1995.5,-1.5,col="black",length=.1)
 text(1994.75,-2.5,"Test Start",cex=Cex.set)
 abline(v=1994)
 abline(v=1998)

 #-----------------------------------
 # calculate exact p-values based on ranked mspe
 #-----------------------------------

 # extract and transform gaps data / convert rownames to years
 tmp <- data.frame(data)
 years <- rownames(tmp)
 rownames(tmp) <- NULL
 tmp <- cbind(years,tmp)
 tmp$years <- as.numeric((tmp$years))
 head(tmp)
 names(tmp)

 # calculaute pre treatmnet mspe
 pre <- tmp[tmp$years <= 1994,]
 pre <- pre[,!(names(pre) %in% c("years"))]
 mspe_pre <- apply(pre[1:5,]^2,2,mean) # this squares the gaps and averages for each state

 # calculate post treatment mspe
 post <- tmp[tmp$years > 1994,]
 post <- post[,!(names(post) %in% c("years"))]
 mspe_post <- apply(post[1:4,]^2,2,mean) # this squares the gaps and averages for each state

 # calculate the ratio of ppst pre mspe
 tmp2 <- data.frame(mspe_post / mspe_pre)

 # transform
 states <- rownames(tmp2)
 rownames(tmp2) <- NULL
 tmp2 <- cbind(states,tmp2)
 head(tmp2)
 names(tmp2)

 #sort 
 tmp2 <- tmp2[order(-tmp2$mspe_post.mspe_pre),]
 tmp2$rank <- seq(1:NROW(tmp2))

 # calculate exact p-value based on our treatment unit's ranked mspe
 tmp2$rank[tmp2$states == "KY"]/NROW(tmp2)


 #
 # use SCTools to calculate p-value from randomization inference
 #

 #!!!!!!!!!!!!! NOTE: run this after original data prep function not the placebo runs above

 ## Test how extreme was the observed treatment effect given the placebos:


 library(SCtools)

 tdf <- generate.placebos(dataprep.out,synth.out, Sigf.ipop = 2)

 ratio <- mspe.test(tdf)
 ratio$p.val

 mspe.plot(tdf, discard.extreme = TRUE)


 #---------------------------------------------------------
 #  synthtetic controls example from: https://mixtape.scunning.com/synthetic-control.html
 #---------------------------------------------------------

 library(tidyverse)
 library(haven)
 library(Synth)
 library(devtools)
 if(!require(SCtools)) devtools::install_github("bcastanho/SCtools")
 library(SCtools)

 read_data <- function(df)
 {
  full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/", 
                     df, sep = "")
  df <- read_dta(full_path)
  return(df)
 }

 texas <- read_data("texas.dta") %>%
  as.data.frame(.)

 dataprep_out <- dataprep(
  foo = texas,
  predictors = c("poverty", "income"),
  predictors.op = "mean",
  time.predictors.prior = 1985:1993,
  special.predictors = list(
    list("bmprison", c(1988, 1990:1992), "mean"),
    list("alcohol", 1990, "mean"),
    list("aidscapita", 1990:1991, "mean"),
    list("black", 1990:1992, "mean"),
    list("perc1519", 1990, "mean")),
  dependent = "bmprison",
  unit.variable = "statefip",
  unit.names.variable = "state",
  time.variable = "year",
  treatment.identifier = 48,
  controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
  time.optimize.ssr = 1985:1993,
  time.plot = 1985:2000
 )

 synth_out <- synth(data.prep.obj = dataprep_out)


 #
 # visualization  - trend and gaps
 #

 path.plot(synth_out, dataprep_out)

 gaps.plot(synth_out, dataprep_out)

 #
 # visualization - placebo plots
 # 

 placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)

 plot_placebos(placebos)

 mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex state level synthetic controls.R
	# \| DATE: 2/12/21
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: This code builds some intuition for how synthetic control methods work
	# \| using simulated data synthetic controls are created for a treatment state
	# \| placebo plots visualize uncertainty/rarity of the treatment
	# *----------------------------------------------------------------

	# Code based on Synth: An R Package for Synthetic Control Methods in Comparative Case Studies
	# Journal of Statistical Software June 2011, Volume 42, Issue 13

	# See also: https://cran.r-project.org/web/packages/Synth/Synth.pdf

	# additionally following this simulation see example code from: https://mixtape.scunning.com/synthetic-control.html

	rm(list=ls()) # get rid of any existing data

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	#-------------------------------------------
	# simulate our state panel data
	#-------------------------------------------

	years <- c(rep(1990:1998,20))

	ID <- c(rep(1,9),rep(2,9),rep(3,9),rep(4,9),rep(5,9),rep(6,9),rep(7,9),rep(8,9),
	rep(9,9),rep(10,9),rep(11,9),rep(12,9),rep(13,9),rep(14,9),rep(15,9),
	rep(16,9),rep(17,9),rep(18,9),rep(19,9),rep(20,9))
	state <- c(rep("CA",9),rep("CO",9),rep("CT",9),rep("DW",9),rep("FL",9),
	rep("TN",9),rep("KY",9),rep("GA",9),rep("NC",9),rep("SC",9),rep("LA",9),rep("TX",9),
	rep("AR",9),rep("IL",9),rep("IN",9),rep("MO",9),rep("IA",9),rep("OK",9),rep("NE",9),
	rep("AL",9))
	df <- data.frame(ID,state,years)

	#------------------------------------
	# create some random covariates
	#------------------------------------

	# unemployment

	ky <- c(runif(9,.03,.05))
	ca <- c(runif(9,.02,.04))
	co <- c(runif(9,.03,.04))
	ct <- c(runif(9,.025,.03))
	dw <- c(runif(9,.03,.04))
	fl <- c(runif(9,.03,.035))
	tn <- c(runif(9,.03,.05))
	ga <- c(runif(9,.03,.05))
	nc <- c(runif(9,.03,.05))
	sc <- c(runif(9,.03,.05))
	la <- c(runif(9,.04,.055))
	tx <- c(runif(9,.02,.03))
	ar <- c(runif(9,.05,.06))
	il <- c(runif(9,.025,.03))
	ind <- c(runif(9,.03,.05))
	mo <- c(runif(9,.03,.035))
	ia <- c(runif(9,.03,.035))
	ok <- c(runif(9,.03,.05))
	ne <- c(runif(9,.03,.045))
	al <- c(runif(9,.04,.055))

	UN <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

	# education % w/BS degree

	ky <- c(runif(9,.21,.24))
	ca <- c(runif(9,.24,.26))
	co <- c(runif(9,.22,.25))
	ct <- c(runif(9,.28,.33))
	dw <- c(runif(9,.27,.30))
	fl <- c(runif(9,.23,.24))
	tn <- c(runif(9,.21,.24))
	ga <- c(runif(9,.21,.24))
	nc <- c(runif(9,.21,.24))
	sc <- c(runif(9,.21,.23))
	la <- c(runif(9,.13,.15))
	tx <- c(runif(9,.20,.23))
	ar <- c(runif(9,.15,.18))
	il <- c(runif(9,.27,.29))
	ind <- c(runif(9,.26,.28))
	mo <- c(runif(9,.18,.21))
	ia <- c(runif(9,.20,.21))
	ok <- c(runif(9,.21,.22))
	ne <- c(runif(9,.21,.22))
	al <- c(runif(9,.15,.17))

	college <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

	# restuarants per capita

	ky <- c(runif(9,4,5))
	ca <- c(runif(9,2,3))
	co <- c(runif(9,3,4))
	ct <- c(runif(9,2,3))
	dw <- c(runif(9,3,4))
	fl <- c(runif(9,5,6))
	tn <- c(runif(9,4,5))
	ga <- c(runif(9,4,5))
	nc <- c(runif(9,4,5))
	sc <- c(runif(9,3.5,4))
	la <- c(runif(9,6,7))
	tx <- c(runif(9,5,6))
	ar <- c(runif(9,3,4))
	il <- c(runif(9,2,3))
	ind <- c(runif(9,2,3))
	mo <- c(runif(9,4,5))
	ia <- c(runif(9,3,4))
	ok <- c(runif(9,2,3))
	ne <- c(runif(9,2,3))
	al <- c(runif(9,5,6))

	fastfood <- c(ca,co,ct,dw,fl,tn,ky,ga,nc,sc,la,tx,ar,il,ind,mo,ia,ok,ne,al)

	df <- data.frame(ID,state,years,UN,college,fastfood)

	# simulate cost outcome as a function of controls
	df$cost <- (df$UN.5 + df$college.25 + .75df$fastfood)1000

	# modify KY cost representing an intervention from 1995-98
	df$cost <- ifelse(df$state =='KY' & df$years >= 1995,df$cost*.5,df$cost)

	# modify a few states to look more like KY
	df$cost<- ifelse(df$state == 'KY', df$cost,
	ifelse(df$state == 'TN' & df$years < 1995,df$cost[df$state == 'KY'],
	ifelse(df$state == 'GA' & df$years < 1995,df$cost[df$state == 'KY'],
	ifelse(df$state == 'NC' & df$years < 1995,df$cost[df$state == 'KY'],
	df$cost))))


	#----------------------------------------------
	# create synthetic controls for treatment (KY)
	#----------------------------------------------


	### Methodology/Estimation

	# Synthetic control methods involve the construction of synthetic control units as convex
	# combinations of multiple control units.

	# To construct our synthetic control unit we define a vector of weights W
	# Each W then represents one particular weighted average of control units and therefore one potential synthetic control unit.

	# To create the most similar synthetic control unit, the synth() function chooses the vector W∗
	# to minimize a distance, kX1 − X0Wk, between X1 and X0W, subject to the weight constraints.

	# The V matrix is introduced to allow different weights to the variables in X0 and X1 depending
	# on their predictive power on the outcome. An optimal choice of V assigns weights that
	# minimize the mean square error of the synthetic control estimator, that is the expectation of
	# (Y1 − Y0W∗)'(Y1 − Y0W∗)

	# In this procedure a V∗ is chosen among all positive definite and diagonal matrices such that
	# the mean squared prediction error (MSPE) of the outcome variable is minimized over some
	# set of pre-intervention periods.

	# IMPORTANT NOTES ABOUT THE REQ DATA STRUCTURE

	# dataset is organized in standard (long) panel-data format, with variables extending
	# across the columns and the rows sorted first by region and then by time-period.10 A name
	# (character-string) and number is provided for each region.11 At least one of these two types
	# of unit-identifiers is required for Synth

	# The first step is to reorganize the panel dataset into an appropriate format that is suitable
	# for the main estimator function synth(). At a minimum, synth() requires as inputs the four
	# data matrices X1, X0, Z1, and Z0 that are needed to construct a synthetic control unit.


	library(Synth) # load Synth package

	df$state = as.character(df$state) # Synth requires our unit name variable to be character format

	# run Synth's data prep function
	dataprep.out=
	dataprep(
	foo = df,
	predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
	predictors.op = "mean", # predictors will be aggregated via averaging
	dependent = "cost", # our outcome variable
	unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
	time.variable = "years", # our panel data is tracked by year in our file
	# special.predictors lets you control the time periods for certain predictor variables
	# some measures are only available for certain years
	# special.predictors = list(
	# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
	# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable
	# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
	# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable
	# ),
	treatment.identifier = 7, # indicates KY is our treatment group
	controls.identifier = c(1:6,8:20), # these states are part of our control pool which will be weighted to create synthetic controls
	time.predictors.prior = c(1990:1994), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
	time.optimize.ssr = c(1990:1994), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
	unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character.
	time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
	)


	print(dataprep.out$X0) # check
	print(dataprep.out$X1) # check
	print(dataprep.out$Z1) # KY outcomes for the pre-intervention period
	print(dataprep.out$Z0) # control outcomes for the pre-intervention period

	synth.out = synth(dataprep.out) # create synthetic controls

	#-----------------------------------
	# some descriptive results
	#------------------------------------


	# pre built tables from synth objects
	synth.tables <- synth.tab(dataprep.res = dataprep.out,synth.res = synth.out)

	# comparing pre-treatment predictor values for the treated unit, the synthetic control unit, and all the units in the sample
	synth.tables$tab.pred[1:3, ] # check balance across treated and control for pre-period predictors

	# view control unit weights
	synth.tables$tab.w

	# calculate gaps or differencesin treatment and synthetic control
	gaps <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)

	gaps[6:9, 1] # intervention years
	gaps[1:5, 1] # pre period years

	#---------------------------------
	# visualize outcome trend
	#---------------------------------

	# plot treatment vs synthetic control outcomes trend
	path.plot(synth.res = synth.out, dataprep.res = dataprep.out,
	Ylab = "made up outcome", Xlab = "year",
	Legend = c("KY","Synthetic KY"),
	Legend.position = "bottomright")


	#------------------------------------
	# visualize impact (gaps)
	#------------------------------------

	# gap plot
	gaps.plot(synth.res = synth.out, dataprep.res = dataprep.out,
	Ylab = "costs", Xlab = "year",
	Ylim = c(-2000, 1000), Main = NA)


	#-----------------------------------------------
	# inference via placebo plot
	#------------------------------------------------

	# These tests involve applying the synthetic control method after reassigning the intervention in the
	# data to units and periods where the intervention did not occur


	# Abadie et al. (2010) describe how synthetic control methods facilitate inferential techniques
	# akin to permutation tests-so-called placebo studies - iteratively apply the synthetic
	# control method by randomly reassigning the intervention in time (i.e., pre-intervention dates)
	# or across units (i.e., to control units where the intervention did not occur) to produce a set of
	# placebo effects. Subsequently, we can compare the set of placebo effects to the effect that was
	# estimated for the time and unit where the intervention actually occurred. This comparison
	# is informative about the rarity of the magnitude of the treatment effect that was observed
	# for the exposed unit.

	# intialize matrices to stor results
	store <- matrix(NA,length(1990:1998),20) # we will loop through all years and states
	colnames(store) <- unique(df$state)


	# run placebo test
	for(iter in 1:20)
	{
	dataprep.out=
	dataprep(
	foo = df,
	predictors = c("UN", "college", "fastfood"), # predictors we are using to create controls units
	predictors.op = "mean", # predictors will be aggregated via averaging
	dependent = "cost", # our outcome variable
	unit.variable = "ID", # this identifies the units of aggregation or level of analysis (state level)
	time.variable = "years", # our panel data is tracked by year in our file
	# special.predictors lets you control the time periods for certain predictor variables
	# in addition to the predictors to create our synthetic control units (some variables may only have measures for certain years)
	# special.predictors = list(
	# list("cost", 1990:1995, "mean") , # use theese year values for Y in the pre period as a 'matching' variable
	# list("UN", 1990:1995, "mean") , # use theese year values for X1 in the pre period as a 'matching' variable
	# list("college", 1990:1995, "mean"), # use theese year values for X2 in the pre period as a 'matching' variable
	# list("fastfood", 1990:1995, "mean") # use theese year values for X3 in the pre period as a 'matching' variable
	# ),
	treatment.identifier = iter, # this is the iterated placebo treatment
	controls.identifier = c(1:20)[-iter],# exclude this iteration's 'placebo treatment', remaining units will be controls
	time.predictors.prior = c(1990:1995), # numeric vector identifying the pretreatment periods over which the values for the outcome predictors should be averaged
	time.optimize.ssr = c(1990:1995), # A numeric vector identifying the periods of the dependent variable over which the loss function should be minimized
	unit.names.variable = "state", # character string identifying the column with the names of the units. This variable has to be of mode character.
	time.plot = 1990:1998 # vector identifying the periods over which results are to be plotted
	)
	# run synth
	synth.out <- synth(
	data.prep.obj = dataprep.out,
	method = "BFGS"
	)

	# store gaps
	store[,iter] <- dataprep.out$Y1plot - (dataprep.out$Y0plot %*% synth.out$solution.w)
	}

	#------------------------------------------
	# visualization
	#------------------------------------------

	data <- store
	rownames(data) <- 1990:1998

	# Set bounds in gaps data
	gap.start <- 1
	gap.end <- nrow(data)
	years <- 1990:1998
	gap.end.pre <- which(rownames(data)=="1994")

	# MSPE Pre-Treatment
	mse <- apply(data[gap.start:gap.end.pre,]^2,2,mean)
	ky.mse <- as.numeric(mse[7]) # save MSPE for treatment

	# Exclude states with 5 times higher MSPE than KY (treatment) (I have seen other multiples used like 2x)
	data <- data[, mse < 5*ky.mse]
	Cex.set <- .95

	# Plot
	plot(years,data[gap.start:gap.end,which(colnames(data)=="KY")],
	ylim=c(-2000,2000),xlab="year",
	xlim=c(1990,1998),ylab="gap in costs",
	type="l",lwd=2,col="black",
	xaxs="i",yaxs="i")

	# Add lines for control states
	for (i in 1:ncol(data)) { lines(years,data[gap.start:gap.end,i],col="gray") }

	## Add treatment Line
	lines(years,data[gap.start:gap.end,which(colnames(data)=="KY")],lwd=2,col="black")

	# Add grid

	legend("bottomright",legend=c("treatment","control regions"),
	lty=c(1,1),col=c("black","gray"),lwd=c(2,1),cex=.8)
	#arrows(1995,-1.5,1995.5,-1.5,col="black",length=.1)
	text(1994.75,-2.5,"Test Start",cex=Cex.set)
	abline(v=1994)
	abline(v=1998)

	#-----------------------------------
	# calculate exact p-values based on ranked mspe
	#-----------------------------------

	# extract and transform gaps data / convert rownames to years
	tmp <- data.frame(data)
	years <- rownames(tmp)
	rownames(tmp) <- NULL
	tmp <- cbind(years,tmp)
	tmp$years <- as.numeric((tmp$years))
	head(tmp)
	names(tmp)

	# calculaute pre treatmnet mspe
	pre <- tmp[tmp$years <= 1994,]
	pre <- pre[,!(names(pre) %in% c("years"))]
	mspe_pre <- apply(pre[1:5,]^2,2,mean) # this squares the gaps and averages for each state

	# calculate post treatment mspe
	post <- tmp[tmp$years > 1994,]
	post <- post[,!(names(post) %in% c("years"))]
	mspe_post <- apply(post[1:4,]^2,2,mean) # this squares the gaps and averages for each state

	# calculate the ratio of ppst pre mspe
	tmp2 <- data.frame(mspe_post / mspe_pre)

	# transform
	states <- rownames(tmp2)
	rownames(tmp2) <- NULL
	tmp2 <- cbind(states,tmp2)
	head(tmp2)
	names(tmp2)

	#sort
	tmp2 <- tmp2[order(-tmp2$mspe_post.mspe_pre),]
	tmp2$rank <- seq(1:NROW(tmp2))

	# calculate exact p-value based on our treatment unit's ranked mspe
	tmp2$rank[tmp2$states == "KY"]/NROW(tmp2)


	#
	# use SCTools to calculate p-value from randomization inference
	#

	#!!!!!!!!!!!!! NOTE: run this after original data prep function not the placebo runs above

	## Test how extreme was the observed treatment effect given the placebos:


	library(SCtools)

	tdf <- generate.placebos(dataprep.out,synth.out, Sigf.ipop = 2)

	ratio <- mspe.test(tdf)
	ratio$p.val

	mspe.plot(tdf, discard.extreme = TRUE)


	#---------------------------------------------------------
	# synthtetic controls example from: https://mixtape.scunning.com/synthetic-control.html
	#---------------------------------------------------------

	library(tidyverse)
	library(haven)
	library(Synth)
	library(devtools)
	if(!require(SCtools)) devtools::install_github("bcastanho/SCtools")
	library(SCtools)

	read_data <- function(df)
	{
	full_path <- paste("https://raw.github.com/scunning1975/mixtape/master/",
	df, sep = "")
	df <- read_dta(full_path)
	return(df)
	}

	texas <- read_data("texas.dta") %>%
	as.data.frame(.)

	dataprep_out <- dataprep(
	foo = texas,
	predictors = c("poverty", "income"),
	predictors.op = "mean",
	time.predictors.prior = 1985:1993,
	special.predictors = list(
	list("bmprison", c(1988, 1990:1992), "mean"),
	list("alcohol", 1990, "mean"),
	list("aidscapita", 1990:1991, "mean"),
	list("black", 1990:1992, "mean"),
	list("perc1519", 1990, "mean")),
	dependent = "bmprison",
	unit.variable = "statefip",
	unit.names.variable = "state",
	time.variable = "year",
	treatment.identifier = 48,
	controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
	time.optimize.ssr = 1985:1993,
	time.plot = 1985:2000
	)

	synth_out <- synth(data.prep.obj = dataprep_out)


	#
	# visualization - trend and gaps
	#

	path.plot(synth_out, dataprep_out)

	gaps.plot(synth_out, dataprep_out)

	#
	# visualization - placebo plots
	#

	placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)

	plot_placebos(placebos)

	mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)