Created
December 31, 2020 23:11
-
-
Save BioSciEconomist/711148fa4a62f4c3519e2d772c6c0e6c to your computer and use it in GitHub Desktop.
Demonstrate basic concepts related to to selection bias, regression, and instrumental variables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: Regression_and_IV.py | |
# | DATE: 12/31/20 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: demonstrate basic concepts related to to selection bias, regression, and instrumental variables | |
# *---------------------------------------------------------------- | |
import pandas as pd | |
import numpy as np | |
# | |
# basic utilities | |
# | |
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation | |
# generate some data | |
data = {'treat' :[1,1,1,1,1,1,0,1,1,0,0,0,0,0], | |
'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'], | |
'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2], | |
'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000], | |
'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0], | |
} | |
# convert to a data frame | |
df = pd.DataFrame(data,columns=['treat','lifestyle','wtchg','calories','push']) | |
print(df) | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
# raw comparisons | |
results = smf.ols('wtchg ~ treat', data=df).fit() | |
results.params | |
# hypothetical - what if we could control for lifestyle? | |
results = smf.ols('wtchg ~ treat + lifestyle', data=df).fit() | |
results.params | |
# regression controlling for historical calorie intake | |
results = smf.ols('wtchg ~ treat + calories', data=df).fit() | |
results.params | |
# | |
# manual IV estimation | |
# | |
stage1 = smf.ols('treat ~ push', data=df).fit() | |
stage1.params | |
stage2 = smf.ols('wtchg ~ push', data=df).fit() | |
stage2.params | |
stage2.params[1]/stage1.params[1] # estimate our treatment effect | |
# | |
# using the IV2SLS estimator from statsmodels | |
# | |
from statsmodels.sandbox.regression.gmm import IV2SLS | |
# fit IV2SLS model | |
# syntax: IV2SLS(endogenous, exogenous, instrument) | |
# we can think of this as: IV2SLS(outcome, treatment, insgrument) | |
# note: when stats models runs these models under the hood it expects a constant term that we have to add to get the correct result | |
resultIV = IV2SLS(df['wtchg'], sm.add_constant(df['treat']), sm.add_constant(df['push'])).fit() | |
resultIV.summary() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment