Created
March 5, 2021 00:30
-
-
Save BioSciEconomist/b65e530a3041228c1df2fa360ad17d31 to your computer and use it in GitHub Desktop.
Regression discontinuity model with binary outcome comparison of logistic regression and linear probability model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on Thu Mar 4 17:23:08 2021 | |
@author: mattbogard | |
""" | |
#*----------------------------------------------------------------- | |
# | PROGRAM NAME: ex RDD logistic.py | |
# | DATE: 2/26/21 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: example code for RDD | |
# *---------------------------------------------------------------- | |
# code based on: https://matheusfacure.github.io/python-causality-handbook/16-Regression-Discontinuity-Design.html | |
# data simualted in: https://gist.github.com/BioSciEconomist/68bf299688d8f12bd009d5a7943a90f6 | |
import pandas as pd | |
import numpy as np | |
from matplotlib import style | |
from matplotlib import pyplot as plt | |
import statsmodels.formula.api as smf | |
import statsmodels.api as sm | |
style.use("fivethirtyeight") | |
# home grown marginal effects function for logit model with 2 variables | |
def mfx(result,mu1,mu2,par): | |
""" | |
result: model object from stats models logistic regression | |
ex: y ~ b0 + b1*x1 + b2*x2 | |
mu1: mean value for first variable in model | |
mu2: mean value for 2nd variable in model | |
par: indicates index from 0 for model parameter you want to convert to a | |
marginal effect | |
note: this easily extends to more variables but does not handle predictors | |
with multiple categories (unless they are dummy coded) | |
""" | |
b0 = result.params[0] | |
b1 = result.params[1] | |
b2 = result.params[2] | |
XB = mu1*b1 + mu2*b2 + b0 | |
return (np.exp(XB)/((1+np.exp(XB))**2))*result.params[par] | |
#------------------------------- | |
# read simulated data and explore | |
#------------------------------ | |
# data simulated in R see code: https://gist.github.com/BioSciEconomist/7b0a3628cf70568fe158e140016cad8f | |
df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/logit_RDD.csv') | |
df.head() | |
# visualize simulated data and probabilities by running variable and cutoff | |
df.plot.scatter(x="x", y="p") | |
# for a basline, run linear model on full data | |
smf.ols('y ~ D + x', data=df ).fit().summary() | |
#---------------------------------- | |
# RDD | |
#---------------------------------- | |
# we can use the python package to find the optimal bandwidth | |
from rdd import rdd | |
# identify optimal bandwidth on running variable x | |
bandwidth_opt = rdd.optimal_bandwidth(df['y'], df['x'], cut= 50) | |
print("Optimal bandwidth:", bandwidth_opt) # this matched bw from R | |
# define kernel weighting (triangular) | |
def kernel(R, c, h): | |
indicator = (np.abs(R-c) <= h).astype(float) | |
return indicator * (1 - np.abs(R-c)/h) | |
plt.plot(df["x"], kernel(df["x"], c=50, h=2.29)) | |
plt.xlabel("x") | |
plt.ylabel("Weight") | |
plt.title("Kernel Weight by x"); | |
# restrict data to optimal bandwidth | |
data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50) | |
data_rdd.shape # check | |
# run RDD model with optimal bw | |
model = smf.wls("y ~ D + x", data_rdd, | |
weights=kernel(data_rdd["x"], c=50, h=bandwidth_opt)).fit() | |
model.summary() | |
# run models with half bw | |
data_rdd = rdd.truncated_data(df, 'x', (.5*bandwidth_opt), cut=50) | |
model = smf.wls("y ~ D + x", data_rdd, | |
weights=kernel(data_rdd["x"], c=50, h=.5*bandwidth_opt)).fit() | |
model.summary() | |
# run models with double bw | |
data_rdd = rdd.truncated_data(df, 'x', (2*bandwidth_opt), cut=50) | |
model = smf.wls("y ~ D + x", data_rdd, | |
weights=kernel(data_rdd["x"], c=50, h=4.59)).fit() | |
model.summary() | |
#------------------------------------------ | |
# basic regression with optimal bandwidth | |
#----------------------------------------- | |
data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50) | |
data_rdd.shape # check | |
smf.ols('y ~ D + x', data=data_rdd ).fit().summary() | |
#--------------------------------------- | |
# logistic regression with optimal bandwidth | |
#--------------------------------------- | |
model = smf.logit(formula='y ~ D + x', data=data_rdd).fit() | |
model.summary() | |
print(model.get_margeff().summary()) # get marginal effects | |
# marginal effects from logistic regressin are equivalent to | |
# results from RDD above using linear model | |
mfx(model,.50,50,1) # home grown function gives similar result | |
#-------------------------------------- | |
# kernel weighted logistic regression | |
#-------------------------------------- | |
wts = kernel(data_rdd["x"], c=50, h=bandwidth_opt) | |
fit = smf.glm("y ~ D + x",family=sm.families.Binomial(),data=data_rdd,freq_weights=wts).fit() | |
fit.summary() | |
mfx(fit,.50,50,1) # call function (get_margeff() does not work with this class) | |
# marginal effects from kernel weighted logistic regression are nearly identitical | |
# to results from the RDD above using a linear model and the kernel weighted | |
# RDD above |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment