BioSciEconomist · December 31, 2020 23:11
diff --git a/Regression_and_IV.py b/Regression_and_IV.py

 # *-----------------------------------------------------------------
 # | PROGRAM NAME: Regression_and_IV.py
 # | DATE: 12/31/20
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:      
 # *----------------------------------------------------------------
 # | PURPOSE: demonstrate basic concepts related to to selection bias, regression, and instrumental variables
 # *----------------------------------------------------------------

 import pandas as pd
 import numpy as np

 #
 # basic utilities
 #

 pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation

 # generate some data
 data = {'treat' :[1,1,1,1,1,1,0,1,1,0,0,0,0,0],
        'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'],
 		'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2],
 		'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000],
        'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0],
 }

 # convert to a data frame
 df = pd.DataFrame(data,columns=['treat','lifestyle','wtchg','calories','push'])
 print(df)

 import statsmodels.api as sm # import stastmodels
 import statsmodels.formula.api as smf # this allows us to use an explicit formulation
 
 # raw comparisons
 results = smf.ols('wtchg ~ treat', data=df).fit()
 results.params



 # hypothetical - what if we could control for lifestyle?

 results = smf.ols('wtchg ~ treat + lifestyle', data=df).fit()
 results.params


 # regression controlling for historical calorie intake
 results = smf.ols('wtchg ~ treat + calories', data=df).fit()
 results.params

 # 
 # manual IV estimation
 #


 stage1 = smf.ols('treat ~ push', data=df).fit()
 stage1.params


 stage2 = smf.ols('wtchg ~ push', data=df).fit()
 stage2.params

 stage2.params[1]/stage1.params[1] # estimate our treatment effect


 #
 # using the IV2SLS estimator from statsmodels
 #

 from statsmodels.sandbox.regression.gmm import IV2SLS 

 # fit IV2SLS model 
 # syntax: IV2SLS(endogenous, exogenous, instrument)
 # we can think of this as: IV2SLS(outcome, treatment, insgrument) 
 # note: when stats models runs these models under the hood it expects a constant term that we have to add to get the correct result
 resultIV = IV2SLS(df['wtchg'], sm.add_constant(df['treat']), sm.add_constant(df['push'])).fit()
 resultIV.summary()

	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: Regression_and_IV.py
	# \| DATE: 12/31/20
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: demonstrate basic concepts related to to selection bias, regression, and instrumental variables
	# *----------------------------------------------------------------

	import pandas as pd
	import numpy as np

	#
	# basic utilities
	#

	pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation

	# generate some data
	data = {'treat' :[1,1,1,1,1,1,0,1,1,0,0,0,0,0],
	'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'],
	'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2],
	'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000],
	'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0],
	}

	# convert to a data frame
	df = pd.DataFrame(data,columns=['treat','lifestyle','wtchg','calories','push'])
	print(df)

	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation

	# raw comparisons
	results = smf.ols('wtchg ~ treat', data=df).fit()
	results.params



	# hypothetical - what if we could control for lifestyle?

	results = smf.ols('wtchg ~ treat + lifestyle', data=df).fit()
	results.params


	# regression controlling for historical calorie intake
	results = smf.ols('wtchg ~ treat + calories', data=df).fit()
	results.params

	#
	# manual IV estimation
	#


	stage1 = smf.ols('treat ~ push', data=df).fit()
	stage1.params


	stage2 = smf.ols('wtchg ~ push', data=df).fit()
	stage2.params

	stage2.params[1]/stage1.params[1] # estimate our treatment effect


	#
	# using the IV2SLS estimator from statsmodels
	#

	from statsmodels.sandbox.regression.gmm import IV2SLS

	# fit IV2SLS model
	# syntax: IV2SLS(endogenous, exogenous, instrument)
	# we can think of this as: IV2SLS(outcome, treatment, insgrument)
	# note: when stats models runs these models under the hood it expects a constant term that we have to add to get the correct result
	resultIV = IV2SLS(df['wtchg'], sm.add_constant(df['treat']), sm.add_constant(df['push'])).fit()
	resultIV.summary()