BioSciEconomist · May 17, 2022 21:14
diff --git a/DID LPM Power.py b/DID LPM Power.py
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: DID LPM Power.py
 # | DATE: 1/6/22 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:            
 # *----------------------------------------------------------------
 # | PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID)
 # *----------------------------------------------------------------

 # !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES 
 #         AND RESAMPLING METHOD MAY NEED REVISION

 # CONTACT [email protected] for feedback and suggestions to help improve this script
 
 import pandas as pd
 import numpy as np
 from matplotlib import pyplot
 import statsmodels.api as sm # import stastmodels
 import statsmodels.formula.api as smf # this allows us to use an explicit formulation
 
 import statsmodels
 from statsmodels.stats import power
 
 pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
 np.set_printoptions(suppress=True) # suppress sci notation
 
 #------------------------------------------
 # simulate data
 #------------------------------------------
 
 
 T1_TRT = .85 # treatment pre average outcome
 T2_TRT = .83 # treatment post average outcome
 
 T1_CTRL = .80 # control pre average outcome
 T2_CTRL = .64 # control post average outcome
 
 (T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect
 
 
 # treatment pre post
 
 trtPre = pd.DataFrame(columns=['ID','prob','treat','time'])
 trtPre['ID'] =list(range(1,100001))
 np.random.seed(123)
 trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000)
 trtPre['treat'] = 1
 trtPre['time'] = 0
 
 trtPre['prob'].describe()
 
 
 trtPost = pd.DataFrame(columns=['ID','prob','treat','time'])
 trtPost['ID'] =list(range(1,100001))
 np.random.seed(123)
 trtPost['prob'] = np.random.binomial(1,T2_TRT,100000)
 trtPost['treat'] = 1
 trtPost['time'] = 1
 
 trtPost['prob'].describe()
 
 
 # control pre post
 
 
 ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time'])
 ctrlPre['ID'] =list(range(100001,200001))
 np.random.seed(123)
 ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000)
 ctrlPre['treat'] = 0
 ctrlPre['time'] = 0
 
 ctrlPre['prob'].describe()
 
 
 ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time'])
 ctrlPost['ID'] =list(range(100001,200001))
 np.random.seed(123)
 ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000)
 ctrlPost['treat'] = 0
 ctrlPost['time'] = 1
 
 ctrlPost['prob'].describe()
 
 
 # combine data
 
 df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True)
 
 (trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean())
 
 
 #----------------------------------------
 # estimate DID model
 #----------------------------------------
 
 
 # Y = B0 + B1*Treat + B2*Post  + B3*Treat*Post + e
 
 # raw comparisons
 results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model
 
 results.summary()
 
 
 #------------------------------------
 # test on a sample
 #-----------------------------------
 
 nSize = 150
 
 
 np.random.seed(321)
 sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs
 
 np.random.seed(321)
 sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs
 
 ids = pd.concat([sample_t,sample_c,],ignore_index=True)
 ids['rsample'] = 1
 
 
 # flag and subset randomly sampled IDs
 rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
 mask = (rs['rsample'] == 1)
 rs = rs[mask]
 
 
 # did estimate
 results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model
 
 results.summary()
 

 #--------------------------------
 # power analysis - compute power for a given sample size
 #--------------------------------
 
 # define power analysis function 
 
 nSize = 150
 
 def run_tests(trt_dat,ctrl_dat,size):
    """uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion"""
    # Create an empty array to store replicates
    est = np.empty(size)
    seeds = np.empty(size)
    pvals = np.empty(size)
    # pull bootstrapped samples
    for i in range(size):
        print (i)
        # Create a bootstrap sample
        np.random.seed(i)
        sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs
        np.random.seed(i)
        sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs
       
        ids = pd.concat([sample_t,sample_c,],ignore_index=True)
        ids['rsample'] = 1
 
 
        # flag and subset randomly sampled IDs
        rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
        mask = (rs['rsample'] == 1)
        rs = rs[mask]
       
        # fit model
        model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit()
        est[i] = model.params['treat:time']
        pvals[i] = model.pvalues['treat:time']
        #print(tmp.y1.mean())
        seeds[i] = i
 
    return est, seeds,pvals


 # get simulated trt and control data
 
 run_trt = trtPre
 run_ctrl = ctrlPre 
 
 # estimate power using function defined above

 results = run_tests(run_trt,run_ctrl,500)
 
 report = pd.DataFrame(columns=['seed','est'])
 report['seed'] = results[1]
 report['est'] = results[0]
 report['pvals'] = results[2]
 report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence
 # print(report)
 
 sum(report['sig'])/len(report['sig']) # power
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: DID LPM Power.py
	# \| DATE: 1/6/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID)
	# *----------------------------------------------------------------

	# !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES
	# AND RESAMPLING METHOD MAY NEED REVISION

	# CONTACT [email protected] for feedback and suggestions to help improve this script

	import pandas as pd
	import numpy as np
	from matplotlib import pyplot
	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation

	import statsmodels
	from statsmodels.stats import power

	pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
	np.set_printoptions(suppress=True) # suppress sci notation

	#------------------------------------------
	# simulate data
	#------------------------------------------


	T1_TRT = .85 # treatment pre average outcome
	T2_TRT = .83 # treatment post average outcome

	T1_CTRL = .80 # control pre average outcome
	T2_CTRL = .64 # control post average outcome

	(T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect


	# treatment pre post

	trtPre = pd.DataFrame(columns=['ID','prob','treat','time'])
	trtPre['ID'] =list(range(1,100001))
	np.random.seed(123)
	trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000)
	trtPre['treat'] = 1
	trtPre['time'] = 0

	trtPre['prob'].describe()


	trtPost = pd.DataFrame(columns=['ID','prob','treat','time'])
	trtPost['ID'] =list(range(1,100001))
	np.random.seed(123)
	trtPost['prob'] = np.random.binomial(1,T2_TRT,100000)
	trtPost['treat'] = 1
	trtPost['time'] = 1

	trtPost['prob'].describe()


	# control pre post


	ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time'])
	ctrlPre['ID'] =list(range(100001,200001))
	np.random.seed(123)
	ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000)
	ctrlPre['treat'] = 0
	ctrlPre['time'] = 0

	ctrlPre['prob'].describe()


	ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time'])
	ctrlPost['ID'] =list(range(100001,200001))
	np.random.seed(123)
	ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000)
	ctrlPost['treat'] = 0
	ctrlPost['time'] = 1

	ctrlPost['prob'].describe()


	# combine data

	df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True)

	(trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean())


	#----------------------------------------
	# estimate DID model
	#----------------------------------------


	# Y = B0 + B1Treat + B2Post + B3TreatPost + e

	# raw comparisons
	results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model

	results.summary()


	#------------------------------------
	# test on a sample
	#-----------------------------------

	nSize = 150


	np.random.seed(321)
	sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs

	np.random.seed(321)
	sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs

	ids = pd.concat([sample_t,sample_c,],ignore_index=True)
	ids['rsample'] = 1


	# flag and subset randomly sampled IDs
	rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
	mask = (rs['rsample'] == 1)
	rs = rs[mask]


	# did estimate
	results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model

	results.summary()


	#--------------------------------
	# power analysis - compute power for a given sample size
	#--------------------------------

	# define power analysis function

	nSize = 150

	def run_tests(trt_dat,ctrl_dat,size):
	"""uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion"""
	# Create an empty array to store replicates
	est = np.empty(size)
	seeds = np.empty(size)
	pvals = np.empty(size)
	# pull bootstrapped samples
	for i in range(size):
	print (i)
	# Create a bootstrap sample
	np.random.seed(i)
	sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs
	np.random.seed(i)
	sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs

	ids = pd.concat([sample_t,sample_c,],ignore_index=True)
	ids['rsample'] = 1


	# flag and subset randomly sampled IDs
	rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
	mask = (rs['rsample'] == 1)
	rs = rs[mask]

	# fit model
	model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit()
	est[i] = model.params['treat:time']
	pvals[i] = model.pvalues['treat:time']
	#print(tmp.y1.mean())
	seeds[i] = i

	return est, seeds,pvals


	# get simulated trt and control data

	run_trt = trtPre
	run_ctrl = ctrlPre

	# estimate power using function defined above

	results = run_tests(run_trt,run_ctrl,500)

	report = pd.DataFrame(columns=['seed','est'])
	report['seed'] = results[1]
	report['est'] = results[0]
	report['pvals'] = results[2]
	report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence
	# print(report)

	sum(report['sig'])/len(report['sig']) # power