BioSciEconomist · February 20, 2021 17:56
diff --git a/ex synthetic controls.py b/ex synthetic controls.py

 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex synthetic controls.py
 # | DATE: 2/13/21 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:            
 # *----------------------------------------------------------------
 # | PURPOSE: intuition for the application of synthetic control methods in python
 # *----------------------------------------------------------------

 # estimation and placebo plots and inference

 # this code is sourced from: https://github.com/OscarEngelbrektson/SyntheticControlMethods/blob/master/examples/german_reunification.ipynb

 # Replicating German Reunification study from Abadie, Diamond and Hainmueller (2015) 
 # This meant provide a worked example of how the package can be used, and how to 
 # interpret the different plots.

 # In this example, we replicate Abadie, Diamond and Hainmueller (2015) which estimates 
 # the economic impact of the 1990 German reunification on West Germany using the synthetic
 # control method. We perform the replication using both the ordinary Synthetic Control and 
 # the Differenced Synthetic Control implemented in this package.


 #Import packages
 import pandas as pd
 import numpy as np


 from SyntheticControlMethods import Synth

 # get data from: 
 # https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/german_reunification.csv
 df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/german_reunification.csv')

 df = df.drop(columns="code", axis=1)
 df.head()


 # Variable descriptions and data sources:

 # GDP per Capita (PPP, 2002 USD): Source: OECD National Accounts (retrieved via the OECD Health Database). Data for West Germany was obtained from Statistisches Bundesamt 2005 (Arbeitskreis “Volkswirtschaftliche Gesamtrechnungen der Lander”) and converted using PPP ¨monetary conversion factors (retrieved from the OECD Health Database).

 # Investment Rate: Ratio of real domestic investment (private plus public) to real GDP. The data are reported in five-year averages. Source: Barro, Robert Joseph, and Jong-wha Lee. 1994. “Data Set for a Panel of 138 Countries.” Available at http://www.nber.org/pub/barro.lee/.

 # Schooling: Percentage of secondary school attained in the total population aged 25 and older. The data are reported in five-year increments. Source: Barro, Robert Joseph, and Jong-wha Lee. 2000. “International Data on Educational Attainment: Updates and Implications.” CID Working Paper No. 42, April 2000 – Human Capital Updated Files.

 # Industry: industry share of value added. Source: World Bank WDI Database 2005 and Statistisches Bundesamt 2005.

 # Inflation: annual percentage change in consumer prices (base year 1995). Source: World Development Indicators

 # Database 2005 and Statistisches Bundesamt 2005. Trade Openness: Export plus imports as percentage of GDP. 
 # Source: World Bank: World Development Indicators CD-ROM 2000.


 df.describe()


 # Replication using Synthetic Control, Synth

 # Synthetic Control is fit using the Synth() method which takes the following inputs:

 # data: Type: Pandas dataframe. A pandas dataframe containing the dataset. Each row should contain one observation for a unit at a time, including the outcome and covariates. Dataset should be ordered by unit then time.

 # outcome_var: Type: str. Name of outcome column in data, e.g. "gdp"

 # id_var: Type: str. Name of unit indicator column in data, e.g. "country"

 # time_var: Type: str. Name of time column in data, e.g. "year"

 # treatment_period: Type: int. Time of first observation after the treatment took place, i.e. first observation affected by the treatment effect. E.g. 1990 for german reunification.

 # treated_unit: Type: str. Name of the unit that recieved treatment, data["id_var"] == treated_unit

 # n_optim: Type: int. Default: 10. Number of different initialization values for which the optimization is run. Higher number means longer runtime, but a higher change of a globally optimal solution.


 # Fit synthetic control
 #def __init__(data, outcome_var, unit_var, time_var, treatment_period, treated_unit)
 sc = Synth(df, "gdp", "country", "year", 1990, "West Germany")

 # Visualize
 sc.plot(["original", "pointwise", "cumulative"], treated_label="West Germany", 
            synth_label="Synthetic West Germany", treatment_label="German Reunification")


 # The plot contains three panels. The first panel, "original", shows the data and a counterfactual 
 # prediction for the post-treatment period. The second panel, "pointwise", shows the difference 
 # between observed data and counterfactual predictions. This is the pointwise causal effect, 
 # as estimated by the model. The third panel, "cumulative", adds up the pointwise contributions
 # from the second panel, resulting in a plot of the cumulative effect of the intervention.

 # RMSPE for Synthetic West Germany vs. West Germany
 # Treated unit is always first unit in rmspe_df
 sc.original_data.rmspe_df.iloc[0]


 #--------------------------------------------------
 # Validity testing of SC
 #--------------------------------------------------

 #"To evaluate the credibility of our results, we conduct placebo studies where the treatment of 
 # interest is reassigned in the data to a year other than 1990 or to countries different from 
 # West Germany." (Abadie et. al, 2015)

 #
 # In-time placebo
 #

 #"We first compare the reunification effect estimated above for West Germany to a placebo 
 # effect obtained after reassigning the German reunification in our data to a period before 
 # the reunification actually took place. A large placebo estimate would undermine our 
 # confidence that the results in Figure 2 are indeed indicative of the economic cost of 
 # reunification and not merely driven by lack of predictive power." (Abadie et. al, 2015)

 # Specifically, we call in_time_placebo(1982), which reruns the model with 1982 as the 
 # first observation after the treatment, about 8 years before the actual reunification in 1990.


 #In-time placebo
 #Placebo treatment period is 1982, 8 years earlier
 sc.in_time_placebo(1982, n_optim=10)

 #Visualize
 sc.plot(['in-time placebo'], 
            treated_label="West Germany",
            synth_label="Synthetic West Germany")

 # Observing the 'in-time placebo' plot, we see that the outcome of the synthetic control 
 # and West Germany do not diverge until the true treatment in 1990, remaining close thereuntil. 
 # This increases our confidence in the counterfactual outcome provided by the synthetic control.


 #
 # In-space placebo
 # 


 # An alternative way to conduct placebo studies is to reassign the treatment in 
 # the data to a comparison unit. In this way, we can obtain synthetic control 
 # estimates for countries that did not experience the event of interest. Applying 
 # this idea to each country in the donor pool allows us to compare the estimated 
 # effect of the German reunification on West Germany to the distribution of placebo 
 # effects obtained for other countries. We will deem the effect of the German 
 # reunification on West Germany significant if the estimated effect for West 
 # Germany is unusually large relative to the distribution of placebo effects.

 # To perform this in-space placebo study, we use the method in_space_placebo(). 
 # The results are best visualized using the 'rmspe ratio' plot, which shows the 
 # distribution of Post-treatment period RMSPE / Pre-treatment period RMSPE for 
 # the true treated unit and each of the placebo treated units. The logic is that 
 # in the presence of a large treatment effect, the post-treatment difference between 
 # the a unit and its synthetic counterpart would be large relative to the pre-treatment 
 # difference.

 # Compute in-space placebos
 sc.in_space_placebo()

 # placebo plot
 sc.plot(['in-space placebo'], in_space_exclusion_multiple=5, treated_label="West Germany",
            synth_label="Synthetic West Germany")

 # Visualize magnitude of rmspe
 sc.plot(['rmspe ratio'], treated_label="West Germany")

 # West Germany is a clear outlier in the Post-period / pre-period RMSPE distribution, 
 # considerably more extreme than any of the placebo treated units. This increases our 
 # confidence in the synthetic control estimates.

 # get calculated rmspe from palcebos and treatment unit
 tmp = sc.original_data.rmspe_df # copy rmspe data frame
 print(tmp)

 # calculate rank based exact p-value
 tmp["rank"] = list(range(1,18,1)) # add ranking variable (value = 18 will need to be modified depending on the lenght of your data frame)
 trtrank = tmp[tmp.unit == "West Germany"] # get data for treatment unit
 pval = trtrank["rank"][0]/len(tmp) # pvalue is relative rank among calculated placebo rmspe ratios
 print('p-value:',pval)

	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex synthetic controls.py
	# \| DATE: 2/13/21
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: intuition for the application of synthetic control methods in python
	# *----------------------------------------------------------------

	# estimation and placebo plots and inference

	# this code is sourced from: https://github.com/OscarEngelbrektson/SyntheticControlMethods/blob/master/examples/german_reunification.ipynb

	# Replicating German Reunification study from Abadie, Diamond and Hainmueller (2015)
	# This meant provide a worked example of how the package can be used, and how to
	# interpret the different plots.

	# In this example, we replicate Abadie, Diamond and Hainmueller (2015) which estimates
	# the economic impact of the 1990 German reunification on West Germany using the synthetic
	# control method. We perform the replication using both the ordinary Synthetic Control and
	# the Differenced Synthetic Control implemented in this package.


	#Import packages
	import pandas as pd
	import numpy as np


	from SyntheticControlMethods import Synth

	# get data from:
	# https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/german_reunification.csv
	df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/german_reunification.csv')

	df = df.drop(columns="code", axis=1)
	df.head()


	# Variable descriptions and data sources:

	# GDP per Capita (PPP, 2002 USD): Source: OECD National Accounts (retrieved via the OECD Health Database). Data for West Germany was obtained from Statistisches Bundesamt 2005 (Arbeitskreis “Volkswirtschaftliche Gesamtrechnungen der Lander”) and converted using PPP ¨monetary conversion factors (retrieved from the OECD Health Database).

	# Investment Rate: Ratio of real domestic investment (private plus public) to real GDP. The data are reported in five-year averages. Source: Barro, Robert Joseph, and Jong-wha Lee. 1994. “Data Set for a Panel of 138 Countries.” Available at http://www.nber.org/pub/barro.lee/.

	# Schooling: Percentage of secondary school attained in the total population aged 25 and older. The data are reported in five-year increments. Source: Barro, Robert Joseph, and Jong-wha Lee. 2000. “International Data on Educational Attainment: Updates and Implications.” CID Working Paper No. 42, April 2000 – Human Capital Updated Files.

	# Industry: industry share of value added. Source: World Bank WDI Database 2005 and Statistisches Bundesamt 2005.

	# Inflation: annual percentage change in consumer prices (base year 1995). Source: World Development Indicators

	# Database 2005 and Statistisches Bundesamt 2005. Trade Openness: Export plus imports as percentage of GDP.
	# Source: World Bank: World Development Indicators CD-ROM 2000.


	df.describe()


	# Replication using Synthetic Control, Synth

	# Synthetic Control is fit using the Synth() method which takes the following inputs:

	# data: Type: Pandas dataframe. A pandas dataframe containing the dataset. Each row should contain one observation for a unit at a time, including the outcome and covariates. Dataset should be ordered by unit then time.

	# outcome_var: Type: str. Name of outcome column in data, e.g. "gdp"

	# id_var: Type: str. Name of unit indicator column in data, e.g. "country"

	# time_var: Type: str. Name of time column in data, e.g. "year"

	# treatment_period: Type: int. Time of first observation after the treatment took place, i.e. first observation affected by the treatment effect. E.g. 1990 for german reunification.

	# treated_unit: Type: str. Name of the unit that recieved treatment, data["id_var"] == treated_unit

	# n_optim: Type: int. Default: 10. Number of different initialization values for which the optimization is run. Higher number means longer runtime, but a higher change of a globally optimal solution.


	# Fit synthetic control
	#def __init__(data, outcome_var, unit_var, time_var, treatment_period, treated_unit)
	sc = Synth(df, "gdp", "country", "year", 1990, "West Germany")

	# Visualize
	sc.plot(["original", "pointwise", "cumulative"], treated_label="West Germany",
	synth_label="Synthetic West Germany", treatment_label="German Reunification")


	# The plot contains three panels. The first panel, "original", shows the data and a counterfactual
	# prediction for the post-treatment period. The second panel, "pointwise", shows the difference
	# between observed data and counterfactual predictions. This is the pointwise causal effect,
	# as estimated by the model. The third panel, "cumulative", adds up the pointwise contributions
	# from the second panel, resulting in a plot of the cumulative effect of the intervention.

	# RMSPE for Synthetic West Germany vs. West Germany
	# Treated unit is always first unit in rmspe_df
	sc.original_data.rmspe_df.iloc[0]


	#--------------------------------------------------
	# Validity testing of SC
	#--------------------------------------------------

	#"To evaluate the credibility of our results, we conduct placebo studies where the treatment of
	# interest is reassigned in the data to a year other than 1990 or to countries different from
	# West Germany." (Abadie et. al, 2015)

	#
	# In-time placebo
	#

	#"We first compare the reunification effect estimated above for West Germany to a placebo
	# effect obtained after reassigning the German reunification in our data to a period before
	# the reunification actually took place. A large placebo estimate would undermine our
	# confidence that the results in Figure 2 are indeed indicative of the economic cost of
	# reunification and not merely driven by lack of predictive power." (Abadie et. al, 2015)

	# Specifically, we call in_time_placebo(1982), which reruns the model with 1982 as the
	# first observation after the treatment, about 8 years before the actual reunification in 1990.


	#In-time placebo
	#Placebo treatment period is 1982, 8 years earlier
	sc.in_time_placebo(1982, n_optim=10)

	#Visualize
	sc.plot(['in-time placebo'],
	treated_label="West Germany",
	synth_label="Synthetic West Germany")

	# Observing the 'in-time placebo' plot, we see that the outcome of the synthetic control
	# and West Germany do not diverge until the true treatment in 1990, remaining close thereuntil.
	# This increases our confidence in the counterfactual outcome provided by the synthetic control.


	#
	# In-space placebo
	#


	# An alternative way to conduct placebo studies is to reassign the treatment in
	# the data to a comparison unit. In this way, we can obtain synthetic control
	# estimates for countries that did not experience the event of interest. Applying
	# this idea to each country in the donor pool allows us to compare the estimated
	# effect of the German reunification on West Germany to the distribution of placebo
	# effects obtained for other countries. We will deem the effect of the German
	# reunification on West Germany significant if the estimated effect for West
	# Germany is unusually large relative to the distribution of placebo effects.

	# To perform this in-space placebo study, we use the method in_space_placebo().
	# The results are best visualized using the 'rmspe ratio' plot, which shows the
	# distribution of Post-treatment period RMSPE / Pre-treatment period RMSPE for
	# the true treated unit and each of the placebo treated units. The logic is that
	# in the presence of a large treatment effect, the post-treatment difference between
	# the a unit and its synthetic counterpart would be large relative to the pre-treatment
	# difference.

	# Compute in-space placebos
	sc.in_space_placebo()

	# placebo plot
	sc.plot(['in-space placebo'], in_space_exclusion_multiple=5, treated_label="West Germany",
	synth_label="Synthetic West Germany")

	# Visualize magnitude of rmspe
	sc.plot(['rmspe ratio'], treated_label="West Germany")

	# West Germany is a clear outlier in the Post-period / pre-period RMSPE distribution,
	# considerably more extreme than any of the placebo treated units. This increases our
	# confidence in the synthetic control estimates.

	# get calculated rmspe from palcebos and treatment unit
	tmp = sc.original_data.rmspe_df # copy rmspe data frame
	print(tmp)

	# calculate rank based exact p-value
	tmp["rank"] = list(range(1,18,1)) # add ranking variable (value = 18 will need to be modified depending on the lenght of your data frame)
	trtrank = tmp[tmp.unit == "West Germany"] # get data for treatment unit
	pval = trtrank["rank"][0]/len(tmp) # pvalue is relative rank among calculated placebo rmspe ratios
	print('p-value:',pval)