Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created March 5, 2021 00:30
Show Gist options
  • Save BioSciEconomist/b65e530a3041228c1df2fa360ad17d31 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/b65e530a3041228c1df2fa360ad17d31 to your computer and use it in GitHub Desktop.
Regression discontinuity model with binary outcome comparison of logistic regression and linear probability model
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 4 17:23:08 2021
@author: mattbogard
"""
#*-----------------------------------------------------------------
# | PROGRAM NAME: ex RDD logistic.py
# | DATE: 2/26/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example code for RDD
# *----------------------------------------------------------------
# code based on: https://matheusfacure.github.io/python-causality-handbook/16-Regression-Discontinuity-Design.html
# data simualted in: https://gist.github.com/BioSciEconomist/68bf299688d8f12bd009d5a7943a90f6
import pandas as pd
import numpy as np
from matplotlib import style
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
style.use("fivethirtyeight")
# home grown marginal effects function for logit model with 2 variables
def mfx(result,mu1,mu2,par):
"""
result: model object from stats models logistic regression
ex: y ~ b0 + b1*x1 + b2*x2
mu1: mean value for first variable in model
mu2: mean value for 2nd variable in model
par: indicates index from 0 for model parameter you want to convert to a
marginal effect
note: this easily extends to more variables but does not handle predictors
with multiple categories (unless they are dummy coded)
"""
b0 = result.params[0]
b1 = result.params[1]
b2 = result.params[2]
XB = mu1*b1 + mu2*b2 + b0
return (np.exp(XB)/((1+np.exp(XB))**2))*result.params[par]
#-------------------------------
# read simulated data and explore
#------------------------------
# data simulated in R see code: https://gist.github.com/BioSciEconomist/7b0a3628cf70568fe158e140016cad8f
df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/logit_RDD.csv')
df.head()
# visualize simulated data and probabilities by running variable and cutoff
df.plot.scatter(x="x", y="p")
# for a basline, run linear model on full data
smf.ols('y ~ D + x', data=df ).fit().summary()
#----------------------------------
# RDD
#----------------------------------
# we can use the python package to find the optimal bandwidth
from rdd import rdd
# identify optimal bandwidth on running variable x
bandwidth_opt = rdd.optimal_bandwidth(df['y'], df['x'], cut= 50)
print("Optimal bandwidth:", bandwidth_opt) # this matched bw from R
# define kernel weighting (triangular)
def kernel(R, c, h):
indicator = (np.abs(R-c) <= h).astype(float)
return indicator * (1 - np.abs(R-c)/h)
plt.plot(df["x"], kernel(df["x"], c=50, h=2.29))
plt.xlabel("x")
plt.ylabel("Weight")
plt.title("Kernel Weight by x");
# restrict data to optimal bandwidth
data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
data_rdd.shape # check
# run RDD model with optimal bw
model = smf.wls("y ~ D + x", data_rdd,
weights=kernel(data_rdd["x"], c=50, h=bandwidth_opt)).fit()
model.summary()
# run models with half bw
data_rdd = rdd.truncated_data(df, 'x', (.5*bandwidth_opt), cut=50)
model = smf.wls("y ~ D + x", data_rdd,
weights=kernel(data_rdd["x"], c=50, h=.5*bandwidth_opt)).fit()
model.summary()
# run models with double bw
data_rdd = rdd.truncated_data(df, 'x', (2*bandwidth_opt), cut=50)
model = smf.wls("y ~ D + x", data_rdd,
weights=kernel(data_rdd["x"], c=50, h=4.59)).fit()
model.summary()
#------------------------------------------
# basic regression with optimal bandwidth
#-----------------------------------------
data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
data_rdd.shape # check
smf.ols('y ~ D + x', data=data_rdd ).fit().summary()
#---------------------------------------
# logistic regression with optimal bandwidth
#---------------------------------------
model = smf.logit(formula='y ~ D + x', data=data_rdd).fit()
model.summary()
print(model.get_margeff().summary()) # get marginal effects
# marginal effects from logistic regressin are equivalent to
# results from RDD above using linear model
mfx(model,.50,50,1) # home grown function gives similar result
#--------------------------------------
# kernel weighted logistic regression
#--------------------------------------
wts = kernel(data_rdd["x"], c=50, h=bandwidth_opt)
fit = smf.glm("y ~ D + x",family=sm.families.Binomial(),data=data_rdd,freq_weights=wts).fit()
fit.summary()
mfx(fit,.50,50,1) # call function (get_margeff() does not work with this class)
# marginal effects from kernel weighted logistic regression are nearly identitical
# to results from the RDD above using a linear model and the kernel weighted
# RDD above
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment