Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active November 8, 2019 13:44
Show Gist options
  • Save BioSciEconomist/eeb0b5ebde392b01ef66976758ddafd2 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/eeb0b5ebde392b01ef66976758ddafd2 to your computer and use it in GitHub Desktop.
Basic bootstrapping
# *-----------------------------------------------------------------
# | PROGRAM NAME: basic boot.R
# | DATE: 11/9/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: demo R boot package and home grown bootsrapping for differences in means and difference-in-differences
# *----------------------------------------------------------------
#---------------------------------
# baseline OLS model
#---------------------------------
# set linear model baseline
summary(lm(ERVisits ~ treat, data = df_chrt))
# 95% CI
est <- -.266
std.err <- .363
LL = est - 1.96 * std.err
UB = est + 1.96*std.err
ci <- cbind(Estmiate = est, "SE" = std.err,LL,UB)
print(ci)
rm(est,std.err,LL,UB)
#---------------------------------
# roll your own bs
#---------------------------------
# define treatment and control groups for separate bootstraps
treat <- df_chrt_match[df_chrt$treat == 1,]
ctrl <- df_chrt_match[df_chrt$treat == 0,]
t0 <- mean(treat$ERVisits) - mean(ctrl$ERVisits) # point estimate
set.seed(1234567)
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:10000) {
sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ] # sample from control group
# sample from data with replacement (this is the bs sample)
bstrap <- c(bstrap, (mean(sample1$ERVisits) - mean(sample2$ERVisits))) # calculate estimate from bs sample
}
hist(bstrap) # view bs distribution
summary(bstrap) # summary stats
sd(bstrap, na.rm = TRUE) # standard error
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrapped CI
results <- data.frame(bstrap)
b.under.H0 <- results$bstrap - mean(results$bstrap) # p-value based on bs dist
mean(abs(b.under.H0) > abs(t0))
rm(bstrap,treat, ctrl, sample1, sample2) # cleanup
##################################################
# compare to basic boot from boot package
##################################################
# function to bootstrap the dummy variable regression estimate of differences in means
# this is different from above where we were pulling separate samples from the treatment and control groups
diff <- function(formula, data, indices) {
d <- data[indices,] # allows boot to select sample
fit <- lm(formula, data=d)
return(coef(fit)[2]) # get the coefficient for treatment (this gives mean differences)
}
# bootstrapping with 10000 replications
results <- boot(data=df_chrt_match, statistic=diff,
R=10000, formula=ERVisits ~ treat)
# view results
plot(results)
# get 95% bs confidence interval
boot.ci(results, type="all") # matches percentile method from above & same standard error we got above
b.under.H0 <- results$t - mean(results$t)
mean(abs(b.under.H0) > abs(results$t0)) # same p-value as above
#-----------------------------------------
# example for difference-in-differences via R bloggers
#---------------------------------------
# Reference: https://www.r-bloggers.com/bootstrapping-standard-errors-for-difference-in-differences-estimation-with-r/
run_DiD <- function(my_data, indices){
d <- my_data[indices,]
return(
mean(d$rprice[d$year==1981 & d$nearinc==1]) -
mean(d$rprice[d$year==1981 & d$nearinc==0]) -
(mean(d$rprice[d$year==1978 & d$nearinc==1]) -
mean(d$rprice[d$year==1978 & d$nearinc==0]))
)
}
boot_est <- boot(data, run_DiD, R=1000, parallel="multicore", ncpus = 2)
#### using our example above: (note data was restructured
run_DiD <- function(my_data, indices){
d <- my_data[indices,]
return(
(mean(d$ERVisits[d$treat == 1]) - mean(d$ERVisits[d$treat == 1])) -
(mean(d$ERVisits[d$treat == 0]) - mean(d$ERVisits[d$treat == 0]) )
)
}
# bootstrapping with 10000 replications
set.seed(123456)
results <- boot(data=df_chrt, statistic=run_DiD,
R=10000)
print(results)
# view results
plot(results)
# get 95% bs confidence interval
boot.ci(results, type="all")
# adhoc bs p-value
b.under.H0 <- results$t - mean(results$t)
mean(abs(b.under.H0) > abs(results$t0))
#-----------------------------------------
# roll your own
#---------------------------------------
treat <- df_chrt[df_chrt$treat == 1,]
ctrl <- df_chrt[df_chrt$treat == 0,]
t0 <- (mean(treat$Post_ERVisits) - mean(treat$Pre_ERVisits)) -
(mean(ctrl$Post_ERVisits) - mean(ctrl$Pre_ERVisits) )
set.seed(1234567)
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:10000) {
sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ] # sample from control group
# sample from data with replacement (this is the bs sample)
bstrap <- c(bstrap,(mean(sample1$Post_ERVisits) - mean(sample1$Pre_ERVisits)) -
(mean(sample2$Post_ERVisits) - mean(sample2$Pre_ERVisits) ) ) # calculate estimate from bs sample
}
hist(bstrap) # view bs distribution
summary(bstrap) # summary stats
sd(bstrap, na.rm = TRUE) # standard error
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrapped CI
bsresults <- data.frame(bstrap)
b.under.H0 <- bsresults$bstrap - mean(bsresults$bstrap) # p-value based on bs dist
mean(abs(b.under.H0) > abs(t0))
rm(bstrap,treat, ctrl, sample1, sample2) # cleanup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment