Last active
May 17, 2022 21:14
-
-
Save BioSciEconomist/b1e7308b1ffed12b4b5be4c7c030edf3 to your computer and use it in GitHub Desktop.
Back of the Envelope Difference in Differences Simulation and Power Analysis for a Linear Probability Model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: DID LPM Power.py | |
# | DATE: 1/6/22 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID) | |
# *---------------------------------------------------------------- | |
# !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES | |
# AND RESAMPLING METHOD MAY NEED REVISION | |
# CONTACT [email protected] for feedback and suggestions to help improve this script | |
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
import statsmodels | |
from statsmodels.stats import power | |
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation | |
np.set_printoptions(suppress=True) # suppress sci notation | |
#------------------------------------------ | |
# simulate data | |
#------------------------------------------ | |
T1_TRT = .85 # treatment pre average outcome | |
T2_TRT = .83 # treatment post average outcome | |
T1_CTRL = .80 # control pre average outcome | |
T2_CTRL = .64 # control post average outcome | |
(T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect | |
# treatment pre post | |
trtPre = pd.DataFrame(columns=['ID','prob','treat','time']) | |
trtPre['ID'] =list(range(1,100001)) | |
np.random.seed(123) | |
trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000) | |
trtPre['treat'] = 1 | |
trtPre['time'] = 0 | |
trtPre['prob'].describe() | |
trtPost = pd.DataFrame(columns=['ID','prob','treat','time']) | |
trtPost['ID'] =list(range(1,100001)) | |
np.random.seed(123) | |
trtPost['prob'] = np.random.binomial(1,T2_TRT,100000) | |
trtPost['treat'] = 1 | |
trtPost['time'] = 1 | |
trtPost['prob'].describe() | |
# control pre post | |
ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time']) | |
ctrlPre['ID'] =list(range(100001,200001)) | |
np.random.seed(123) | |
ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000) | |
ctrlPre['treat'] = 0 | |
ctrlPre['time'] = 0 | |
ctrlPre['prob'].describe() | |
ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time']) | |
ctrlPost['ID'] =list(range(100001,200001)) | |
np.random.seed(123) | |
ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000) | |
ctrlPost['treat'] = 0 | |
ctrlPost['time'] = 1 | |
ctrlPost['prob'].describe() | |
# combine data | |
df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True) | |
(trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean()) | |
#---------------------------------------- | |
# estimate DID model | |
#---------------------------------------- | |
# Y = B0 + B1*Treat + B2*Post + B3*Treat*Post + e | |
# raw comparisons | |
results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model | |
results.summary() | |
#------------------------------------ | |
# test on a sample | |
#----------------------------------- | |
nSize = 150 | |
np.random.seed(321) | |
sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs | |
np.random.seed(321) | |
sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs | |
ids = pd.concat([sample_t,sample_c,],ignore_index=True) | |
ids['rsample'] = 1 | |
# flag and subset randomly sampled IDs | |
rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left') | |
mask = (rs['rsample'] == 1) | |
rs = rs[mask] | |
# did estimate | |
results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model | |
results.summary() | |
#-------------------------------- | |
# power analysis - compute power for a given sample size | |
#-------------------------------- | |
# define power analysis function | |
nSize = 150 | |
def run_tests(trt_dat,ctrl_dat,size): | |
"""uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion""" | |
# Create an empty array to store replicates | |
est = np.empty(size) | |
seeds = np.empty(size) | |
pvals = np.empty(size) | |
# pull bootstrapped samples | |
for i in range(size): | |
print (i) | |
# Create a bootstrap sample | |
np.random.seed(i) | |
sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs | |
np.random.seed(i) | |
sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs | |
ids = pd.concat([sample_t,sample_c,],ignore_index=True) | |
ids['rsample'] = 1 | |
# flag and subset randomly sampled IDs | |
rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left') | |
mask = (rs['rsample'] == 1) | |
rs = rs[mask] | |
# fit model | |
model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() | |
est[i] = model.params['treat:time'] | |
pvals[i] = model.pvalues['treat:time'] | |
#print(tmp.y1.mean()) | |
seeds[i] = i | |
return est, seeds,pvals | |
# get simulated trt and control data | |
run_trt = trtPre | |
run_ctrl = ctrlPre | |
# estimate power using function defined above | |
results = run_tests(run_trt,run_ctrl,500) | |
report = pd.DataFrame(columns=['seed','est']) | |
report['seed'] = results[1] | |
report['est'] = results[0] | |
report['pvals'] = results[2] | |
report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence | |
# print(report) | |
sum(report['sig'])/len(report['sig']) # power | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment