BioSciEconomist · November 14, 2021 22:35
diff --git a/ex basic econometrics.py b/ex basic econometrics.py

 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex basic econometrics.py
 # | DATE: 11/14/21
 # | CREATED BY: MATT BOGARD
 # | PROJECT FILE:
 # *----------------------------------------------------------------
 # | PURPOSE: example calculate ME for LOGIT vs LPM VIFs and robust SEs
 # *----------------------------------------------------------------

 # see also: ex basic econometrics.R https://gist.github.com/BioSciEconomist/b674d323b867b0da4483dfe97ed117ea

 import pandas as pd
 import numpy as np


 import statsmodels.api as sm # import stastmodels
 import statsmodels.formula.api as smf # this allows us to use an explicit formulation


 # read lalonde data
 # NOTE(review): absolute, machine-specific path - point this at your own copy of lalonde.csv

 df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")

 df['ID'] = range(len(df)) # create ID 0..n-1 (sized from the data, not a hard-coded 614)

 # results are printed explicitly so the checks also show up when run as a script
 print(df.head(100)) # check
 print(df.columns) # check
 df.info() # check (info() prints its report itself)
 print(df.describe()) # check

 tmp = df[['re74','re75']] # check additional fields
 print(tmp.describe())

 # basic regression lpm (OLS with the treat indicator as the outcome)
 results = smf.ols('treat ~  age + nodegree + re74 + re75', data=df).fit()
 print(results.summary2()) # print so the table is shown when run as a script


 # logit model (same specification as the lpm above)

 model = smf.logit(formula= 'treat ~   age + nodegree + re74 + re75', data=df).fit()
 print(model.summary())


 #
 # marginal effects using stats models functions
 #

 print(model.get_margeff(at ='overall').summary()) # marginal effects averaged over the observations
 print(model.get_margeff(at ='mean').summary()) # marginal effects evaluated at the regressor means

 # home grown marginal effects (using equivalent glm model)


 # Binomial family with an explicit logit link. The link is passed as an
 # instance (Logit()) because recent statsmodels releases deprecate/reject
 # passing the link class itself.
 model = smf.glm('treat ~   age + nodegree + re74 + re75', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.Logit()))
 result = model.fit()
 print(result.summary()) # print so the table is shown when run as a script

 # covariate values at which the home grown marginal effects are evaluated
 # NOTE(review): the means are rounded to whole numbers, so the results can
 # only approximate get_margeff(at='mean') - confirm the rounding is intended
 mu1 = round(df.age.mean())
 mu2 = round(df.nodegree.mean())
 mu3 = round(df.re74.mean())
 mu4 = round(df.re75.mean())

 def mfx(result,mu1,mu2,mu3,mu4,par):
    """
    Marginal effect of one parameter of a four-predictor logistic model,
    evaluated at the supplied covariate values.

    result: model object from stats models logistic regression; only its
            `params` attribute is read (intercept first, then four slopes)
    ex: y ~ b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4
    mu1: mean value for first variable in model
    mu2: mean value for 2nd variable in model
    mu3: mean value for 3rd variable in model
    mu4: mean value for 4th variable in model
    par: indicates index from 0 for model parameter you want to convert to a
         marginal effect
    returns: logistic density at XB = b0 + b1*mu1 + ... + b4*mu4 multiplied
             by the selected parameter
    note: this easily extends to more variables but does not handle predictors
          with multiple categories (unless they are dummy coded)
    """
    params = result.params
    # positional access that works for a pandas Series (via .iloc) as well as
    # a plain ndarray/sequence; avoids integer-key lookup on a label-indexed
    # Series, which pandas has deprecated
    by_pos = getattr(params, 'iloc', params)
    b0, b1, b2, b3, b4 = (by_pos[i] for i in range(5))
    # an integer par is a position; anything else (e.g. a coefficient name)
    # is looked up exactly as given
    b_par = by_pos[par] if isinstance(par, (int, np.integer)) else params[par]
    XB = mu1*b1 + mu2*b2 + mu3*b3 + mu4*b4 + b0
    # exp(XB)/(1+exp(XB))**2 is symmetric in XB, so evaluate it at -|XB|:
    # the exponential then stays in (0, 1] and cannot overflow into inf/inf = nan
    e = np.exp(-np.abs(XB))
    return (e/((1+e)**2))*b_par

 # calculate marginal effects for each variable (compare to the at the mean ME using smf)
 # printed explicitly so the values are shown when run as a script
 print(mfx(result,mu1,mu2,mu3,mu4,1)) # age
 print(mfx(result,mu1,mu2,mu3,mu4,2)) # nodegree
 print(mfx(result,mu1,mu2,mu3,mu4,3)) # re74
 print(mfx(result,mu1,mu2,mu3,mu4,4)) # re75

 #
 # VIFs
 #

 def variance_inflation_factor(exog, exog_idx):
    """
    Variance inflation factor (VIF) for one column of a design matrix.

    The selected column is regressed by OLS on all remaining columns of
    exog, and the VIF is 1 / (1 - R^2) of that auxiliary regression.

    exog : array-like, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog
    returns : float
        the variance inflation factor of column exog_idx
    """
    # accept DataFrames / nested lists as well as ndarrays
    exog = np.asarray(exog)
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    # boolean mask selecting every column except the one under study
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    r_squared_i = sm.OLS(x_i, x_noti).fit().rsquared
    vif = 1. / (1. - r_squared_i)
    return vif


 tmp = df[['age','nodegree','re74','re75']]

 X = sm.add_constant(tmp) # adds the intercept column to the design matrix

 # one VIF per column of X (including the added constant), labelled by column
 # name; printed so the table is shown when run as a script
 print(pd.Series([variance_inflation_factor(X.values, i)
               for i in range(X.shape[1])],
              index=X.columns))

 # compare to R vif function from car library

 # library(car)

 # df = read.csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")

 # vif(lm(treat ~ age + nodegree + re74 + re75, data=df))

 #     age nodegree     re74     re75 
 #   1.128    1.049    1.654    1.447 




 # in order to include categoricals we have to recode

 # dtype=int keeps the dummy columns numeric; newer pandas defaults to bool,
 # which would turn X.values into an object array that OLS cannot use
 tmp = pd.get_dummies(df, columns=['race'], dtype=int)

 # only the black and hispan dummies are kept, so any other race level is the
 # omitted reference category
 tmp = tmp[['age','race_black','race_hispan','nodegree','re74','re75']]


 X = sm.add_constant(tmp)

 # printed so the table is shown when run as a script
 print(pd.Series([variance_inflation_factor(X.values, i)
               for i in range(X.shape[1])],
              index=X.columns))



 #
 # robust SEs
 #

 # basic regression lpm (default, non-robust covariance)
 results = smf.ols('treat ~  age + nodegree + re74 + re75', data=df).fit()
 print(results.summary2()) # print so the table is shown when run as a script

 # same model refit with the HC1 robust covariance estimator; compare the
 # standard errors against the table printed above
 results = smf.ols('treat ~  age + nodegree + re74 + re75', data=df).fit(cov_type = 'HC1')
 print(results.summary2())

 # similar results to R below:
 #                Estimate   Std. Error t value       Pr(>|t|)    
 #(Intercept)  0.369393334  0.055311541    6.68 0.000000000055 ***
 #age         -0.001014507  0.001802206   -0.56           0.57    
 #nodegree     0.053119443  0.036936833    1.44           0.15    
 #re74        -0.000016601  0.000003369   -4.93 0.000001071947 ***
 #re75         0.000000845  0.000006455    0.13           0.90

	# *-----------------------------------------------------------------
	# | PROGRAM NAME: ex basic econometrics.py
	# | DATE: 11/14/21
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE:
	# *----------------------------------------------------------------
	# | PURPOSE: example calculate ME for LOGIT vs LPM VIFs and robust SEs
	# *----------------------------------------------------------------

	# see also: ex basic econometrics.R https://gist.github.com/BioSciEconomist/b674d323b867b0da4483dfe97ed117ea

	import pandas as pd
	import numpy as np


	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation


	# read lalonde data
	# NOTE(review): absolute, machine-specific path - point this at your own copy of lalonde.csv

	df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")

	df['ID'] = range(len(df)) # create ID 0..n-1 (sized from the data, not a hard-coded 614)

	# results are printed explicitly so the checks also show up when run as a script
	print(df.head(100)) # check
	print(df.columns) # check
	df.info() # check (info() prints its report itself)
	print(df.describe()) # check

	tmp = df[['re74','re75']] # check additional fields
	print(tmp.describe())

	# basic regression lpm (OLS with the treat indicator as the outcome)
	results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
	print(results.summary2()) # print so the table is shown when run as a script


	# logit model (same specification as the lpm above)

	model = smf.logit(formula= 'treat ~ age + nodegree + re74 + re75', data=df).fit()
	print(model.summary())


	#
	# marginal effects using stats models functions
	#

	print(model.get_margeff(at ='overall').summary()) # marginal effects averaged over the observations
	print(model.get_margeff(at ='mean').summary()) # marginal effects evaluated at the regressor means

	# home grown marginal effects (using equivalent glm model)


	# Binomial family with an explicit logit link. The link is passed as an
	# instance (Logit()) because recent statsmodels releases deprecate/reject
	# passing the link class itself.
	model = smf.glm('treat ~ age + nodegree + re74 + re75', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.Logit()))
	result = model.fit()
	print(result.summary()) # print so the table is shown when run as a script

	# covariate values at which the home grown marginal effects are evaluated
	# NOTE(review): the means are rounded to whole numbers, so the results can
	# only approximate get_margeff(at='mean') - confirm the rounding is intended
	mu1 = round(df.age.mean())
	mu2 = round(df.nodegree.mean())
	mu3 = round(df.re74.mean())
	mu4 = round(df.re75.mean())

	def mfx(result,mu1,mu2,mu3,mu4,par):
	    """
	    Marginal effect of one parameter of a four-predictor logistic model,
	    evaluated at the supplied covariate values.

	    result: model object from stats models logistic regression; only its
	            `params` attribute is read (intercept first, then four slopes)
	    ex: y ~ b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4
	    mu1: mean value for first variable in model
	    mu2: mean value for 2nd variable in model
	    mu3: mean value for 3rd variable in model
	    mu4: mean value for 4th variable in model
	    par: indicates index from 0 for model parameter you want to convert to a
	         marginal effect
	    returns: logistic density at XB = b0 + b1*mu1 + ... + b4*mu4 multiplied
	             by the selected parameter
	    note: this easily extends to more variables but does not handle predictors
	          with multiple categories (unless they are dummy coded)
	    """
	    params = result.params
	    # positional access that works for a pandas Series (via .iloc) as well as
	    # a plain ndarray/sequence; avoids integer-key lookup on a label-indexed
	    # Series, which pandas has deprecated
	    by_pos = getattr(params, 'iloc', params)
	    b0, b1, b2, b3, b4 = (by_pos[i] for i in range(5))
	    # an integer par is a position; anything else (e.g. a coefficient name)
	    # is looked up exactly as given
	    b_par = by_pos[par] if isinstance(par, (int, np.integer)) else params[par]
	    XB = mu1*b1 + mu2*b2 + mu3*b3 + mu4*b4 + b0
	    # exp(XB)/(1+exp(XB))**2 is symmetric in XB, so evaluate it at -|XB|:
	    # the exponential then stays in (0, 1] and cannot overflow into inf/inf = nan
	    e = np.exp(-np.abs(XB))
	    return (e/((1+e)**2))*b_par

	# calculate marginal effects for each variable (compare to the at the mean ME using smf)
	# printed explicitly so the values are shown when run as a script
	print(mfx(result,mu1,mu2,mu3,mu4,1)) # age
	print(mfx(result,mu1,mu2,mu3,mu4,2)) # nodegree
	print(mfx(result,mu1,mu2,mu3,mu4,3)) # re74
	print(mfx(result,mu1,mu2,mu3,mu4,4)) # re75

	#
	# VIFs
	#

	def variance_inflation_factor(exog, exog_idx):
	    """
	    Variance inflation factor (VIF) for one column of a design matrix.

	    The selected column is regressed by OLS on all remaining columns of
	    exog, and the VIF is 1 / (1 - R^2) of that auxiliary regression.

	    exog : array-like, (nobs, k_vars)
	        design matrix with all explanatory variables, as for example used in
	        regression
	    exog_idx : int
	        index of the exogenous variable in the columns of exog
	    returns : float
	        the variance inflation factor of column exog_idx
	    """
	    # accept DataFrames / nested lists as well as ndarrays
	    exog = np.asarray(exog)
	    k_vars = exog.shape[1]
	    x_i = exog[:, exog_idx]
	    # boolean mask selecting every column except the one under study
	    mask = np.arange(k_vars) != exog_idx
	    x_noti = exog[:, mask]
	    r_squared_i = sm.OLS(x_i, x_noti).fit().rsquared
	    vif = 1. / (1. - r_squared_i)
	    return vif


	tmp = df[['age','nodegree','re74','re75']]

	X = sm.add_constant(tmp) # adds the intercept column to the design matrix

	# one VIF per column of X (including the added constant), labelled by column
	# name; printed so the table is shown when run as a script
	print(pd.Series([variance_inflation_factor(X.values, i)
	              for i in range(X.shape[1])],
	             index=X.columns))

	# compare to R vif function from car library

	# library(car)

	# df = read.csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")

	# vif(lm(treat ~ age + nodegree + re74 + re75, data=df))

	# age nodegree re74 re75
	# 1.128 1.049 1.654 1.447




	# in order to include categoricals we have to recode

	# dtype=int keeps the dummy columns numeric; newer pandas defaults to bool,
	# which would turn X.values into an object array that OLS cannot use
	tmp = pd.get_dummies(df, columns=['race'], dtype=int)

	# only the black and hispan dummies are kept, so any other race level is the
	# omitted reference category
	tmp = tmp[['age','race_black','race_hispan','nodegree','re74','re75']]


	X = sm.add_constant(tmp)

	# printed so the table is shown when run as a script
	print(pd.Series([variance_inflation_factor(X.values, i)
	              for i in range(X.shape[1])],
	             index=X.columns))



	#
	# robust SEs
	#

	# basic regression lpm (default, non-robust covariance)
	results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
	print(results.summary2()) # print so the table is shown when run as a script

	# same model refit with the HC1 robust covariance estimator; compare the
	# standard errors against the table printed above
	results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit(cov_type = 'HC1')
	print(results.summary2())

	# similar results to R below:
	# Estimate Std. Error t value Pr(>|t|)
	#(Intercept) 0.369393334 0.055311541 6.68 0.000000000055 ***
	#age -0.001014507 0.001802206 -0.56 0.57
	#nodegree 0.053119443 0.036936833 1.44 0.15
	#re74 -0.000016601 0.000003369 -4.93 0.000001071947 ***
	#re75 0.000000845 0.000006455 0.13 0.90