Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active March 1, 2021 18:47
Show Gist options
  • Save BioSciEconomist/68bf299688d8f12bd009d5a7943a90f6 to your computer and use it in GitHub Desktop.
Basic regression discontinuity in R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex RDD.R
# | DATE: 2/26/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example RDD estimates & visualization vs OLS regressions near the cutoff
# *----------------------------------------------------------------
#----------------------
# example 1
#----------------------

# Packages used throughout this section (previously none were loaded here;
# dplyr/ggplot2 load with tidyverse, RDestimate comes from the 'rdd' package).
library(tidyverse)
library(rdd)

# simulate data
set.seed(123)
x <- runif(1000, -1, 1)  # running variable with a cutoff at 0
cov <- rnorm(1000)       # control covariate
# outcome with a discontinuity (jump of 10) at the cutoff (x >= 0)
y <- 3 + 2*x + 3*cov + 10*(x >= 0) + rnorm(1000)
dat <- data.frame(y, x, cov)      # create data frame
dat$D <- ifelse(dat$x > 0, 1, 0)  # flag 'treatment' group that exceeds the threshold (0)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example1_RDD.csv", row.names = FALSE)

# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

# visualize
ggplot(dat, aes(x, y, color = D)) + geom_point()

# basic estimation of treatment effects using all of the data
summary(lm(y ~ D + x, data = dat))

# subset near the cutoff and estimate the treatment effect
# (this is like a poor person's kernel: 100% weight inside the window, 0% outside)
tmp <- dat[dat$x > -.9 & dat$x < .9, ]

ggplot(tmp, aes(x, y, color = D)) + geom_point()

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp))  # linear model (notice this is much smaller)

# RDD estimate with optimal kernel bandwidth
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))

# manually set the bandwidth to match the subset window above
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .9,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))
# notice this is closer to the regression on the subset of data vs full data

# BUG FIX: was `dev.off` (a bare reference that just prints the function and
# closes nothing); the call `dev.off()` actually closes the graphics device.
dev.off()
#----------------------------------
# example 2
#----------------------------------
# see also: https://mixtape.scunning.com/regression-discontinuity.html

library(tidyverse)

# simulate the data: "test scores" censored at 0 and truncated above 100
# NOTE(review): no set.seed() here, so results differ across runs — add one
# if reproducibility is needed.
dat <- tibble(
  x = rnorm(1000, 50, 25)
) %>%
  mutate(
    x = if_else(x < 0, 0, x)
  ) %>%
  filter(x < 100)

# cutoff at x = 50: D flags units above the threshold; y2 jumps by 40 there
dat <- dat %>%
  mutate(
    D = if_else(x > 50, 1, 0),
    y2 = 25 + 40 * D + 1.5 * x + rnorm(n(), 0, 20)
  )

head(dat)
summary(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example2_RDD.csv", row.names = FALSE)

# visualize
ggplot(aes(x, y2, colour = factor(D)), data = dat) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +  # FALSE, not the reassignable alias F
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

dat %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = dat))  # standard linear model on all of the data

# subset near the cutoff and estimate the treatment effect
# (this is like a poor man's kernel: 100% weight within the bandwidth)
tmp <- dat[dat$x > 45 & dat$x < 55, ]

ggplot(aes(x, y2, colour = factor(D)), data = tmp) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = tmp))  # linear model on the subset

# RDD estimate with optimal kernel bandwidth (RDestimate is from the 'rdd' package)
summary(RDestimate(y2 ~ x, dat, subset = NULL, cutpoint = 50, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))
# notice this is closer to the regression on the subset of data vs full data
#------------------------------
# example 3
#------------------------------

# basic logit simulation (kept for reference)
# set.seed(123)
# x1 = rnorm(1000) # some continuous variables
# z = 1 + 2*x1 # linear combination with a bias
# pr = 1/(1+exp(-z)) # pass through an inv-logit function
# y = rbinom(1000,1,pr) # bernoulli response variable
#
# dat <- data.frame(x1,z,pr,y)
#
# summary(dat)

# simulate data based on a linear probability function
set.seed(123)
x <- runif(1000, -1, 1)  # running variable with a cutoff at 0
# latent probability with a discontinuity (+.15) at the cutoff (x >= 0);
# the noise term can push pr outside [0, 1], in which case rbinom yields NA
pr <- .25 + .25*x + .15*(x >= 0) + rnorm(1000, 0, .05)
y <- rbinom(1000, 1, pr)  # bernoulli response variable (was `y = ...`)
# BUG FIX: the original built data.frame(pr, x, cov, y) without defining `cov`
# in this section — it silently reused the vector left over from example 1 and,
# run standalone, would pick up the base function stats::cov and error out.
# Define the control covariate locally (drawn after y so the x/pr/y draws are
# unchanged for a given seed).
cov <- rnorm(1000)
dat <- data.frame(pr, x, cov, y)  # create data frame
dat$D <- ifelse(dat$x > 0, 1, 0)  # flag 'treatment' group that exceeds the threshold (0)

# remove rows made missing by out-of-range probabilities
dat <- na.omit(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/ex_RDD.csv", row.names = FALSE)

# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(y = mean(pr),
            n = n())

# visualize
ggplot(dat, aes(x, pr, color = D)) + geom_point()  # latent probabilities
# ggplot(dat, aes(x, y, color = D)) + geom_point() # outcome

summary(lm(y ~ D + x, data = dat))  # linear probability model on all of the data

# subset near the cutoff and estimate the treatment effect
# (this is like a poor person's kernel: 100% weight within the bandwidth)
tmp <- dat[dat$x > -.5 & dat$x < .5, ]

ggplot(tmp, aes(x, y, color = D)) + geom_point()   # outcome
ggplot(tmp, aes(x, pr, color = D)) + geom_point()  # latent probabilities

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp))  # linear model (notice this is much smaller)

# RDD estimate with optimal kernel bandwidth
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)

# RDD with a manually selected bandwidth matching the subset window above
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .5,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)
#--------------------------------
# logistic regression - without kernel weighting
#--------------------------------

# fit a logit of the outcome on the treatment flag and the running variable
mylogit <- glm(y ~ D + x, data = dat, family = binomial(link = "logit"))
summary(mylogit)
# Average marginal effects for a fitted logit model.
#
# x: a fitted glm object with a logit link.
# Returns the coefficient vector scaled by the mean logistic density
# evaluated at the linear predictor (i.e. sample-average marginal effects).
mfx <- function(x) {
  linear_pred <- predict(x, type = "link")
  avg_density <- mean(dlogis(linear_pred))
  print("Marginal Effects")
  avg_density * coef(x)
}
mfx(mylogit)  # average marginal effects from the full-sample logit

# benchmark against an OLS linear probability model on the full sample
summary(lm(y ~ D + x, data = dat))

# do marginal effects change if we manually narrow the bandwidth
mylogit <- glm(y ~ D + x, data = tmp, family = binomial(link = "logit"))
summary(mylogit)
mfx(mylogit)

# compare again to the linear model on the subset
summary(lm(y ~ D + x, data = tmp))  # very similar results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment