Created
November 14, 2021 22:35
-
-
Save BioSciEconomist/18d8f7f5eacbf7107c5a72b036d37ea9 to your computer and use it in GitHub Desktop.
Example calculations of marginal effects for LOGIT vs LPM VIFs and robust SEs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: ex basic econometrics.py | |
# | DATE: 11/14/21 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: example calculate ME for LOGIT vs LPM VIFs and robust SEs | |
# *---------------------------------------------------------------- | |
# see also: ex basic econometrics.R https://gist.github.com/BioSciEconomist/b674d323b867b0da4483dfe97ed117ea | |
import pandas as pd | |
import numpy as np | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
# read lalonde data | |
df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv") | |
df['ID'] = range(0,614) # create ID | |
df.head(100) # check | |
df.columns # check | |
df.info() # check | |
df.describe() # check | |
tmp = df[['re74','re75']] # check additional fields | |
tmp.describe() | |
# basic regression lpm | |
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit() | |
results.summary2() | |
# logit model | |
model = smf.logit(formula= 'treat ~ age + nodegree + re74 + re75', data=df).fit() | |
model.summary() | |
# | |
# marginal effects using stats models functions | |
# | |
print(model.get_margeff(at ='overall').summary()) # get marginal effects | |
print(model.get_margeff(at ='mean').summary()) # get marginal effects | |
# home grown marginal effects (using equivalent glm model) | |
model = smf.glm('treat ~ age + nodegree + re74 + re75', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.logit)) | |
result = model.fit() | |
result.summary() | |
mu1 = round(df.age.mean()) | |
mu2 = round(df.nodegree.mean()) | |
mu3 = round(df.re74.mean()) | |
mu4 = round(df.re75.mean()) | |
def mfx(result,mu1,mu2,mu3,mu4,par): | |
""" | |
result: model object from stats models logistic regression | |
ex: y ~ b0 + b1*x1 + b2*x2 | |
mu1: mean value for first variable in model | |
mu2: mean value for 2nd variable in model | |
par: indicates index from 0 for model parameter you want to convert to a | |
marginal effect | |
note: this easily extends to more variables but does not handle predictors | |
with multiple categories (unless they are dummy coded) | |
""" | |
b0 = result.params[0] | |
b1 = result.params[1] | |
b2 = result.params[2] | |
b3 = result.params[3] | |
b4 = result.params[4] | |
XB = mu1*b1 + mu2*b2 + mu3*b3 +mu4*b4+ b0 | |
return (np.exp(XB)/((1+np.exp(XB))**2))*result.params[par] | |
# calculate marginal effets for each variable (matches at the mean ME using smf) | |
mfx(result,mu1,mu2,mu3,mu4,1) # age | |
mfx(result,mu1,mu2,mu3,mu4,2) # nodegree | |
mfx(result,mu1,mu2,mu3,mu4,3) # re74 | |
mfx(result,mu1,mu2,mu3,mu4,4) # re75 | |
# | |
# VIFs | |
# | |
def variance_inflation_factor(exog, exog_idx): | |
""" | |
exog : ndarray, (nobs, k_vars) | |
design matrix with all explanatory variables, as for example used in | |
regression | |
exog_idx : int | |
index of the exogenous variable in the columns of exog | |
""" | |
k_vars = exog.shape[1] | |
x_i = exog[:, exog_idx] | |
mask = np.arange(k_vars) != exog_idx | |
x_noti = exog[:, mask] | |
r_squared_i = sm.OLS(x_i, x_noti).fit().rsquared | |
vif = 1. / (1. - r_squared_i) | |
return vif | |
tmp = df[['age','nodegree','re74','re75']] | |
X = sm.add_constant(tmp) | |
pd.Series([variance_inflation_factor(X.values, i) | |
for i in range(X.shape[1])], | |
index=X.columns) | |
# compare to R vif function from car library | |
# library(car) | |
# df = read.csv("/Users/mattbogard/Google Drive/Data/lalonde.csv") | |
# vif(lm(treat ~ age + nodegree + re74 + re75, data=df)) | |
# age nodegree re74 re75 | |
# 1.128 1.049 1.654 1.447 | |
# in order to include categoricals we have to recode | |
tmp = pd.get_dummies(df, columns=['race']) | |
tmp = tmp[['age','race_black','race_hispan','nodegree','re74','re75']] | |
X = sm.add_constant(tmp) | |
pd.Series([variance_inflation_factor(X.values, i) | |
for i in range(X.shape[1])], | |
index=X.columns) | |
# | |
# robust SEs | |
# | |
# basic regression lpm | |
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit() | |
results.summary2() | |
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit(cov_type = 'HC1') | |
results.summary2() | |
# similar results to R below: | |
# Estimate Std. Error t value Pr(>|t|) | |
#(Intercept) 0.369393334 0.055311541 6.68 0.000000000055 *** | |
#age -0.001014507 0.001802206 -0.56 0.57 | |
#nodegree 0.053119443 0.036936833 1.44 0.15 | |
#re74 -0.000016601 0.000003369 -4.93 0.000001071947 *** | |
#re75 0.000000845 0.000006455 0.13 0.90 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment