Basic regression discontinuity in R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex RDD.R
# | DATE: 2/26/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example RDD estimates & visualization vs OLS regression near the cutoff
# *----------------------------------------------------------------

library(tidyverse) # dplyr + ggplot2 for data wrangling and plots
library(rdd)       # RDestimate for local linear RDD
#----------------------
# example 1
#----------------------

# simulate data
set.seed(123)
x <- runif(1000,-1,1) # running variable with cutoff at 0
cov <- rnorm(1000) # control variable
y <- 3 + 2*x + 3*cov + 10*(x>=0) + rnorm(1000) # outcome with discontinuity at cutoff (x >= 0)
dat <- data.frame(y,x,cov) # create data frame
dat$D <- ifelse(dat$x > 0, 1, 0) # flag 'treatment' group that exceeds the threshold (0)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example1_RDD.csv", row.names = FALSE)
# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

# visualize (D is a 0/1 flag, so map it as a factor to get discrete colors)
ggplot(dat, aes(x, y, color = factor(D))) + geom_point()
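# A common RDD visual (a sketch, not part of the original gist): plot bin means
# of the outcome against the running variable so the jump at the cutoff stands
# out from the point-level noise. The bin width (.1) is an arbitrary assumption.
dat %>%
  mutate(bin = floor(x/.1)*.1 + .05) %>% # bin centers of width .1
  group_by(bin, D) %>%
  summarize(y = mean(y), .groups = "drop") %>%
  ggplot(aes(bin, y, color = factor(D))) +
  geom_point() +
  geom_vline(xintercept = 0, linetype = 2)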
summary(lm(y ~ D + x, data = dat)) # basic estimation of treatment effects using all of the data

# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within bw = .9)
tmp <- dat[dat$x > -.9 & dat$x < .9,]

ggplot(tmp, aes(x, y, color = factor(D))) + geom_point()

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp)) # linear model (note: fit on a smaller sample near the cutoff)
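# A minimal sketch (not part of the original gist) making the 'kernel' analogy
# explicit: instead of giving every point inside the window equal weight, a
# triangular kernel downweights points linearly with distance from the cutoff.
# The bandwidth (.9) is an assumption chosen to match the subset above.
bw <- .9
dat$w <- pmax(0, 1 - abs(dat$x - 0)/bw) # triangular weights: 1 at the cutoff, 0 outside bw
summary(lm(y ~ D + x, data = dat, weights = w)) # weighted OLS approximates a kernel-weighted local linear fit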
# RDD estimate with optimal kernel bw (rdd uses the Imbens-Kalyanaraman optimal bandwidth when bw = NULL)
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))
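# To inspect that optimal bandwidth directly, a sketch using the rdd package's
# IKbandwidth() (the same Imbens-Kalyanaraman calculation RDestimate applies
# internally when bw = NULL):
IKbandwidth(dat$x, dat$y, cutpoint = 0, kernel = "triangular")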
# manually set bw
summary(RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .9,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))

# notice this is closer to the regression on the subset of data vs the full data

dev.off()
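# A standard RDD diagnostic (a sketch, assuming the rdd package's DCdensity()):
# the McCrary density test checks for sorting/manipulation of the running
# variable; a significant discontinuity in its density at the cutoff is a red flag.
DCdensity(dat$x, cutpoint = 0)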
#----------------------------------
# example 2
#----------------------------------

# see also: https://mixtape.scunning.com/regression-discontinuity.html
# (tidyverse already loaded above)
# simulate the data
dat <- tibble(
  x = rnorm(1000, 50, 25)
) %>%
  mutate(
    x = if_else(x < 0, 0, x)
  ) %>%
  filter(x < 100)

# cutoff at x = 50
dat <- dat %>%
  mutate(
    D = if_else(x > 50, 1, 0),
    y2 = 25 + 40 * D + 1.5 * x + rnorm(n(), 0, 20)
  )

head(dat)
summary(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/example2_RDD.csv", row.names = FALSE)
# visualize
ggplot(aes(x, y2, colour = factor(D)), data = dat) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

dat %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = dat)) # standard linear model
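# A sketch (not in the original gist): let the slope differ on each side of the
# cutoff by interacting D with the running variable centered at 50. With this
# parameterization the coefficient on D is the jump exactly at the cutoff.
summary(lm(y2 ~ D * I(x - 50), data = dat))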
# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within the bw)
tmp <- dat[dat$x > 45 & dat$x < 55,]

ggplot(aes(x, y2, colour = factor(D)), data = tmp) +
  geom_point(alpha = 0.5) +
  geom_vline(xintercept = 50, colour = "grey", linetype = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Test score (X)", y = "Potential Outcome (Y)")

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y2),
            n = n())

summary(lm(y2 ~ D + x, data = tmp)) # linear model on the trimmed sample
# RDD estimate with optimal kernel bw
summary(RDestimate(y2 ~ x, dat, subset = NULL, cutpoint = 50, bw = NULL,
                   kernel = "triangular", se.type = "HC1", cluster = NULL,
                   verbose = FALSE, model = FALSE, frame = FALSE))

# notice this is closer to the regression on the subset of data vs the full data
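# Optional cross-check (a sketch, assuming the separate rdrobust package is
# installed; it is not used elsewhere in this gist): rdrobust() gives
# bias-corrected local polynomial RDD estimates with robust confidence intervals.
library(rdrobust)
summary(rdrobust(y = dat$y2, x = dat$x, c = 50))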
#------------------------------
# example 3
#------------------------------

# basic logit simulation
# set.seed(123)
# x1 = rnorm(1000) # some continuous variables
# z = 1 + 2*x1 # linear combination with a bias
# pr = 1/(1+exp(-z)) # pass through an inv-logit function
# y = rbinom(1000,1,pr) # bernoulli response variable
#
# dat <- data.frame(x1,z,pr,y)
#
# summary(dat)

# simulate data based on a linear probability function
set.seed(123)
x <- runif(1000,-1,1) # running variable with cutoff at 0
pr <- .25 + .25*x + .15*(x>=0) + rnorm(1000,0,.05) # latent probability with discontinuity at cutoff (x >= 0)
y <- rbinom(1000,1,pr) # bernoulli response variable (NA wherever pr falls outside [0,1])
dat <- data.frame(pr,x,cov,y) # create data frame (note: reuses cov simulated in example 1)
dat$D <- ifelse(dat$x > 0, 1, 0) # flag 'treatment' group that exceeds the threshold (0)

# remove rows where rbinom() returned NA because pr was outside [0,1]
dat <- na.omit(dat)

# save for other work
# write.csv(dat, file = "/Users/mattbogard/Google Drive/Python Scripts/ex_RDD.csv", row.names = FALSE)
# some descriptives
summary(dat)

dat %>%
  group_by(D) %>%
  summarize(pr = mean(pr),
            n = n())

# visualize
ggplot(dat, aes(x, pr, color = factor(D))) + geom_point() # latent probabilities
# ggplot(dat, aes(x, y, color = factor(D))) + geom_point() # outcome

summary(lm(y ~ D + x, data = dat)) # linear probability model using all of the data
# subset near the cutoff and estimate treatment effect (this is like a poor person's kernel giving 100% weight within bw = .5)
tmp <- dat[dat$x > -.5 & dat$x < .5,]

ggplot(tmp, aes(x, y, color = factor(D))) + geom_point() # outcome
ggplot(tmp, aes(x, pr, color = factor(D))) + geom_point() # latent probabilities

tmp %>%
  group_by(D) %>%
  summarize(y = mean(y),
            n = n())

summary(lm(y ~ D + x, data = tmp)) # linear probability model on the trimmed sample
# RDD estimate with optimal kernel bw
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = NULL,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)

# RDD with manually selected bw
est_RDD <- RDestimate(y ~ x, dat, subset = NULL, cutpoint = 0, bw = .5,
                      kernel = "triangular", se.type = "HC1", cluster = NULL,
                      verbose = FALSE, model = FALSE, frame = FALSE)
summary(est_RDD)
#--------------------------------
# logistic regression - without kernel weighting
#-------------------------------

mylogit <- glm(y ~ D + x, family = binomial(link = 'logit'), data = dat)
summary(mylogit)

# function for calculating average marginal effects from a fitted logit model
mfx <- function(x){
  pdf <- mean(dlogis(predict(x, type = "link"))) # average logistic density at the fitted linear predictor
  marginal.effects <- pdf*coef(x) # scale each coefficient by that average density
  print("Marginal Effects")
  return(marginal.effects)
}

mfx(mylogit) # results

summary(lm(y ~ D + x, data = dat)) # compare to OLS (linear probability model)
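# A cross-check (a sketch, not in the original gist): since D is binary, its
# marginal effect can also be computed as the average discrete change in
# predicted probabilities when D switches from 0 to 1, holding x at its observed values.
d1 <- transform(dat, D = 1)
d0 <- transform(dat, D = 0)
mean(predict(mylogit, newdata = d1, type = "response") -
     predict(mylogit, newdata = d0, type = "response"))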
# do marginal effects change if we manually narrow the bandwidth?
mylogit <- glm(y ~ D + x, family = binomial(link = 'logit'), data = tmp)
summary(mylogit)
mfx(mylogit)

# compare again to the linear model
summary(lm(y ~ D + x, data = tmp)) # very similar results
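# A final robustness sketch (not in the original gist): re-estimate at placebo
# cutpoints on either side of the true cutoff (0), each using only data from one
# side; a valid design should show no significant 'effect' at these fake thresholds.
summary(RDestimate(y ~ x, dat[dat$x < 0,], cutpoint = -.5, kernel = "triangular", se.type = "HC1"))
summary(RDestimate(y ~ x, dat[dat$x > 0,], cutpoint = .5, kernel = "triangular", se.type = "HC1"))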