BioSciEconomist · March 5, 2021 00:30
diff --git a/ex RDD logistic.py b/ex RDD logistic.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Thu Mar  4 17:23:08 2021

 @author: mattbogard
 """

 #*-----------------------------------------------------------------
 # | PROGRAM NAME: ex RDD logistic.py
 # | DATE: 2/26/21 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:            
 # *----------------------------------------------------------------
 # | PURPOSE: example code for RDD
 # *----------------------------------------------------------------


 # code based on: https://matheusfacure.github.io/python-causality-handbook/16-Regression-Discontinuity-Design.html

 # data simualted in: https://gist.github.com/BioSciEconomist/68bf299688d8f12bd009d5a7943a90f6 


 import pandas as pd
 import numpy as np
 from matplotlib import style
 from matplotlib import pyplot as plt
 import statsmodels.formula.api as smf
 import statsmodels.api as sm


 style.use("fivethirtyeight")

 # home grown marginal effects function for logit model with 2 variables

 def mfx(result,mu1,mu2,par):
    """
    result: model object from stats models logistic regression
    ex: y ~ b0 + b1*x1 + b2*x2
    mu1: mean value for first variable in model
    mu2: mean value for 2nd variable in model
    par: indicates index from 0 for model parameter you want to convert to a 
         marginal effect
    note: this easily extends to more variables but does not handle predictors
          with multiple categories (unless they are dummy coded)
    """
    b0 =  result.params[0]  
    b1 =  result.params[1] 
    b2 =  result.params[2] 
    XB = mu1*b1 + mu2*b2 + b0 
    return (np.exp(XB)/((1+np.exp(XB))**2))*result.params[par]


 #-------------------------------
 # read simulated data and explore
 #------------------------------

 # data simulated in R see code: https://gist.github.com/BioSciEconomist/7b0a3628cf70568fe158e140016cad8f
 df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/logit_RDD.csv')
 df.head()

 # visualize simulated data and probabilities by running variable and cutoff
 df.plot.scatter(x="x", y="p")

 # for a basline, run linear model on full data
 smf.ols('y ~ D + x', data=df ).fit().summary() 


 #----------------------------------
 # RDD
 #----------------------------------

 # we can use the python package to find the optimal bandwidth
 from rdd import rdd

 # identify optimal bandwidth on running variable x
 bandwidth_opt = rdd.optimal_bandwidth(df['y'], df['x'], cut= 50)
 print("Optimal bandwidth:", bandwidth_opt) # this matched bw from R


 # define kernel weighting (triangular)
 def kernel(R, c, h):
    indicator = (np.abs(R-c) <= h).astype(float)
    return indicator * (1 - np.abs(R-c)/h)

 plt.plot(df["x"], kernel(df["x"], c=50, h=2.29))
 plt.xlabel("x")
 plt.ylabel("Weight")
 plt.title("Kernel Weight by x");


 # restrict data to optimal bandwidth
 data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
 data_rdd.shape # check


 # run RDD model with optimal bw
 model = smf.wls("y ~ D + x", data_rdd,
                weights=kernel(data_rdd["x"], c=50, h=bandwidth_opt)).fit()

 model.summary()

 # run models with half bw
 data_rdd = rdd.truncated_data(df, 'x', (.5*bandwidth_opt), cut=50)
 model = smf.wls("y ~ D + x", data_rdd,
                weights=kernel(data_rdd["x"], c=50, h=.5*bandwidth_opt)).fit()

 model.summary()


 # run models with double bw
 data_rdd = rdd.truncated_data(df, 'x', (2*bandwidth_opt), cut=50)
 model = smf.wls("y ~ D + x", data_rdd,
                weights=kernel(data_rdd["x"], c=50, h=4.59)).fit()

 model.summary()

 #------------------------------------------
 # basic regression with optimal bandwidth
 #-----------------------------------------

 data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
 data_rdd.shape # check

 smf.ols('y ~ D + x', data=data_rdd ).fit().summary() 

 #---------------------------------------
 # logistic regression with optimal bandwidth
 #---------------------------------------


 model = smf.logit(formula='y ~ D + x', data=data_rdd).fit()
 model.summary()

 print(model.get_margeff().summary()) # get marginal effects 

 # marginal effects from logistic regressin are equivalent to 
 # results from RDD above using linear model

 mfx(model,.50,50,1) # home grown function gives similar result



 #--------------------------------------
 # kernel weighted logistic regression
 #--------------------------------------


 wts = kernel(data_rdd["x"], c=50, h=bandwidth_opt)
 fit = smf.glm("y ~ D + x",family=sm.families.Binomial(),data=data_rdd,freq_weights=wts).fit()
 fit.summary()

 mfx(fit,.50,50,1) # call function (get_margeff() does not work with this class)

 # marginal effects from kernel weighted logistic regression are nearly identitical
 # to results from the RDD above using a linear model and the kernel weighted
 # RDD above
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	"""
	Created on Thu Mar 4 17:23:08 2021

	@author: mattbogard
	"""

	#*-----------------------------------------------------------------
	# \| PROGRAM NAME: ex RDD logistic.py
	# \| DATE: 2/26/21
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: example code for RDD
	# *----------------------------------------------------------------


	# code based on: https://matheusfacure.github.io/python-causality-handbook/16-Regression-Discontinuity-Design.html

	# data simualted in: https://gist.github.com/BioSciEconomist/68bf299688d8f12bd009d5a7943a90f6


	import pandas as pd
	import numpy as np
	from matplotlib import style
	from matplotlib import pyplot as plt
	import statsmodels.formula.api as smf
	import statsmodels.api as sm


	style.use("fivethirtyeight")

	# home grown marginal effects function for logit model with 2 variables

	def mfx(result,mu1,mu2,par):
	"""
	result: model object from stats models logistic regression
	ex: y ~ b0 + b1x1 + b2x2
	mu1: mean value for first variable in model
	mu2: mean value for 2nd variable in model
	par: indicates index from 0 for model parameter you want to convert to a
	marginal effect
	note: this easily extends to more variables but does not handle predictors
	with multiple categories (unless they are dummy coded)
	"""
	b0 = result.params[0]
	b1 = result.params[1]
	b2 = result.params[2]
	XB = mu1b1 + mu2b2 + b0
	return (np.exp(XB)/((1+np.exp(XB))*2))result.params[par]


	#-------------------------------
	# read simulated data and explore
	#------------------------------

	# data simulated in R see code: https://gist.github.com/BioSciEconomist/7b0a3628cf70568fe158e140016cad8f
	df = pd.read_csv('/Users/mattbogard/Google Drive/Python Scripts/logit_RDD.csv')
	df.head()

	# visualize simulated data and probabilities by running variable and cutoff
	df.plot.scatter(x="x", y="p")

	# for a basline, run linear model on full data
	smf.ols('y ~ D + x', data=df ).fit().summary()


	#----------------------------------
	# RDD
	#----------------------------------

	# we can use the python package to find the optimal bandwidth
	from rdd import rdd

	# identify optimal bandwidth on running variable x
	bandwidth_opt = rdd.optimal_bandwidth(df['y'], df['x'], cut= 50)
	print("Optimal bandwidth:", bandwidth_opt) # this matched bw from R


	# define kernel weighting (triangular)
	def kernel(R, c, h):
	indicator = (np.abs(R-c) <= h).astype(float)
	return indicator * (1 - np.abs(R-c)/h)

	plt.plot(df["x"], kernel(df["x"], c=50, h=2.29))
	plt.xlabel("x")
	plt.ylabel("Weight")
	plt.title("Kernel Weight by x");


	# restrict data to optimal bandwidth
	data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
	data_rdd.shape # check


	# run RDD model with optimal bw
	model = smf.wls("y ~ D + x", data_rdd,
	weights=kernel(data_rdd["x"], c=50, h=bandwidth_opt)).fit()

	model.summary()

	# run models with half bw
	data_rdd = rdd.truncated_data(df, 'x', (.5*bandwidth_opt), cut=50)
	model = smf.wls("y ~ D + x", data_rdd,
	weights=kernel(data_rdd["x"], c=50, h=.5*bandwidth_opt)).fit()

	model.summary()


	# run models with double bw
	data_rdd = rdd.truncated_data(df, 'x', (2*bandwidth_opt), cut=50)
	model = smf.wls("y ~ D + x", data_rdd,
	weights=kernel(data_rdd["x"], c=50, h=4.59)).fit()

	model.summary()

	#------------------------------------------
	# basic regression with optimal bandwidth
	#-----------------------------------------

	data_rdd = rdd.truncated_data(df, 'x', bandwidth_opt, cut=50)
	data_rdd.shape # check

	smf.ols('y ~ D + x', data=data_rdd ).fit().summary()

	#---------------------------------------
	# logistic regression with optimal bandwidth
	#---------------------------------------


	model = smf.logit(formula='y ~ D + x', data=data_rdd).fit()
	model.summary()

	print(model.get_margeff().summary()) # get marginal effects

	# marginal effects from logistic regressin are equivalent to
	# results from RDD above using linear model

	mfx(model,.50,50,1) # home grown function gives similar result



	#--------------------------------------
	# kernel weighted logistic regression
	#--------------------------------------


	wts = kernel(data_rdd["x"], c=50, h=bandwidth_opt)
	fit = smf.glm("y ~ D + x",family=sm.families.Binomial(),data=data_rdd,freq_weights=wts).fit()
	fit.summary()

	mfx(fit,.50,50,1) # call function (get_margeff() does not work with this class)

	# marginal effects from kernel weighted logistic regression are nearly identitical
	# to results from the RDD above using a linear model and the kernel weighted
	# RDD above