Basic regression discontinuity in R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex RDD.R
# | DATE: 2/26/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example RDD estimates & visualization vs OLS regression near the cutoff
# *----------------------------------------------------------------

library(tidyverse) # dplyr + ggplot2 for data wrangling and plots
library(rdd)       # RDestimate for local linear RDD
#----------------------
# example 1
#----------------------

# simulate data
set.seed(123)
x <- runif(1000,-1,1) # running variable with cutoff at 0
cov <- rnorm(1000) # control variable
y <- 3 + 2*x + 3*cov + 10*(x>=0) + rnorm(1000) # outcome with discontinuity at cutoff (x >= 0)
dat <- data.frame(y,x,cov) # create data frame
dat$D <- ifelse(dat$x > 0, 1, 0) # flag 'treatment' group that exceeds the threshold (0)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example1_RDD.csv", row.names = FALSE)
# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

# visualize (D is a 0/1 flag, so map it as a factor to get discrete colors)
ggplot(dat, aes(x, y, color = factor(D))) + geom_point()
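# A common RDD visual (a sketch, not part of the original gist): plot bin means
# of the outcome against the running variable so the jump at the cutoff stands
# out from the point-level noise. The bin width (.1) is an arbitrary assumption.
dat %>%
  mutate(bin = floor(x/.1)*.1 + .05) %>% # bin centers of width .1
  group_by(bin, D) %>%
  summarize(y = mean(y), .groups = "drop") %>%
  ggplot(aes(bin, y, color = factor(D))) +
  geom_point() +
  geom_vline(xintercept = 0, linetype = 2)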
summary(lm(y ~ D + x, data = dat)) # basic estimation of treatment effects using all of the data

# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within bw = .9)
tmp <- dat[dat$x > -.9 & dat$x < .9,]

ggplot(tmp, aes(x, y, color = factor(D))) + geom_point()

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp)) # linear model (note: fit on a smaller sample near the cutoff)
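# A minimal sketch (not part of the original gist) making the 'kernel' analogy
# explicit: instead of giving every point inside the window equal weight, a
# triangular kernel downweights points linearly with distance from the cutoff.
# The bandwidth (.9) is an assumption chosen to match the subset above.
bw <- .9
dat$w <- pmax(0, 1 - abs(dat$x - 0)/bw) # triangular weights: 1 at the cutoff, 0 outside bw
summary(lm(y ~ D + x, data = dat, weights = w)) # weighted OLS approximates a kernel-weighted local linear fit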
# RDD estimate with optimal kernel bw (rdd uses the Imbens-Kalyanaraman optimal bandwidth when bw = NULL)
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))
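# To inspect that optimal bandwidth directly, a sketch using the rdd package's
# IKbandwidth() (the same Imbens-Kalyanaraman calculation RDestimate applies
# internally when bw = NULL):
IKbandwidth(dat$x, dat$y, cutpoint = 0, kernel = "triangular")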
# manually set bw
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .9,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))

# notice this is closer to the regression on the subset of data vs the full data

dev.off()
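# A standard RDD diagnostic (a sketch, assuming the rdd package's DCdensity()):
# the McCrary density test checks for sorting/manipulation of the running
# variable; a significant discontinuity in its density at the cutoff is a red flag.
DCdensity(dat$x, cutpoint = 0)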
#----------------------------------
# example 2
#----------------------------------

# see also: https://mixtape.scunning.com/regression-discontinuity.html
# (tidyverse already loaded above)
# simulate the data
dat <- tibble(
  x = rnorm(1000, 50, 25)
) %>%
  mutate(
    x = if_else(x < 0, 0, x)
  ) %>%
  filter(x < 100)

# cutoff at x = 50
dat <- dat %>%
  mutate(
    D = if_else(x > 50, 1, 0),
    y2 = 25 + 40 * D + 1.5 * x + rnorm(n(), 0, 20)
  )

head(dat)
summary(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example2_RDD.csv", row.names = FALSE)
# visualize
ggplot(aes(x, y2, colour = factor(D)), data = dat) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

dat %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = dat)) # standard linear model
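# A sketch (not in the original gist): let the slope differ on each side of the
# cutoff by interacting D with the running variable centered at 50. With this
# parameterization the coefficient on D is the jump exactly at the cutoff.
summary(lm(y2 ~ D * I(x - 50), data = dat))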
# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within the bw)
tmp <- dat[dat$x > 45 & dat$x < 55,]

ggplot(aes(x, y2, colour = factor(D)), data = tmp) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = tmp)) # linear model on the trimmed sample
# RDD estimate with optimal kernel bw
summary(RDestimate(y2 ~ x, dat, subset = NULL, cutpoint = 50, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))

# notice this is closer to the regression on the subset of data vs the full data
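# Optional cross-check (a sketch, assuming the separate rdrobust package is
# installed; it is not used elsewhere in this gist): rdrobust() gives
# bias-corrected local polynomial RDD estimates with robust confidence intervals.
library(rdrobust)
summary(rdrobust(y = dat$y2, x = dat$x, c = 50))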
#------------------------------
# example 3
#------------------------------

# basic logit simulation
# set.seed(123)
# x1 = rnorm(1000) # some continuous variables
# z = 1 + 2*x1 # linear combination with a bias
# pr = 1/(1+exp(-z)) # pass through an inv-logit function
# y = rbinom(1000,1,pr) # bernoulli response variable
#
# dat <- data.frame(x1,z,pr,y)
#
# summary(dat)

# simulate data based on a linear probability function
set.seed(123)
x <- runif(1000,-1,1) # running variable with cutoff at 0
pr <- .25 + .25*x + .15*(x>=0) + rnorm(1000,0,.05) # latent probability with discontinuity at cutoff (x >= 0)
y <- rbinom(1000,1,pr) # bernoulli response variable (NA wherever pr falls outside [0,1])
dat <- data.frame(pr,x,cov,y) # create data frame (note: reuses cov simulated in example 1)
dat$D <- ifelse(dat$x > 0, 1, 0) # flag 'treatment' group that exceeds the threshold (0)

# remove rows where rbinom() returned NA because pr was outside [0,1]
dat <- na.omit(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/ex_RDD.csv", row.names = FALSE)
# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(pr = mean(pr),
            n = n())

# visualize
ggplot(dat, aes(x, pr, color = factor(D))) + geom_point() # latent probabilities
# ggplot(dat, aes(x, y, color = factor(D))) + geom_point() # outcome

summary(lm(y ~ D + x, data = dat)) # linear probability model using all of the data
# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within bw = .5)
tmp <- dat[dat$x > -.5 & dat$x < .5,]

ggplot(tmp, aes(x, y, color = factor(D))) + geom_point() # outcome
ggplot(tmp, aes(x, pr, color = factor(D))) + geom_point() # latent probabilities

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp)) # linear probability model on the trimmed sample
# RDD estimate with optimal kernel bw
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)

# RDD with manually selected bw
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .5,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)
#--------------------------------
# logistic regression - without kernel weighting
#-------------------------------

mylogit <- glm(y ~ D + x, family = binomial(link = 'logit'), data = dat)
summary(mylogit)

# function for calculating average marginal effects from a fitted logit model
mfx <- function(x){
  pdf <- mean(dlogis(predict(x, type = "link"))) # average logistic density at the fitted linear predictor
  marginal.effects <- pdf*coef(x) # scale each coefficient by that average density
  print("Marginal Effects")
  return(marginal.effects)
}

mfx(mylogit) # results

summary(lm(y ~ D + x, data = dat)) # compare to OLS (linear probability model)
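# A cross-check (a sketch, not in the original gist): since D is binary, its
# marginal effect can also be computed as the average discrete change in
# predicted probabilities when D switches from 0 to 1, holding x at its observed values.
d1 <- transform(dat, D = 1)
d0 <- transform(dat, D = 0)
mean(predict(mylogit, newdata = d1, type = "response") -
     predict(mylogit, newdata = d0, type = "response"))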
# do marginal effects change if we manually narrow the bandwidth?
mylogit <- glm(y ~ D + x, family = binomial(link = 'logit'), data = tmp)
summary(mylogit)
mfx(mylogit)

# compare again to the linear model
summary(lm(y ~ D + x, data = tmp)) # very similar results
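# A final robustness sketch (not in the original gist): re-estimate at placebo
# cutpoints on either side of the true cutoff (0), each using only data from one
# side; a valid design should show no significant 'effect' at these fake thresholds.
summary(RDestimate(y ~ x, dat[dat$x < 0,], cutpoint = -.5, kernel = "triangular", se.type = "HC1"))
summary(RDestimate(y ~ x, dat[dat$x > 0,], cutpoint = .5, kernel = "triangular", se.type = "HC1"))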