BioSciEconomist · November 8, 2019 13:44
diff --git a/boot basic.R b/boot basic.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: basic boot.R
 # | DATE: 11/9/19 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:        
 # *----------------------------------------------------------------
 # | PURPOSE: demo R boot package and home grown bootsrapping for differences in means and difference-in-differences
 # *----------------------------------------------------------------

 #---------------------------------
 # baseline OLS model
 #---------------------------------

 # set linear model baseline
 summary(lm(ERVisits ~ treat, data = df_chrt))

 # 95% CI 
 est <- -.266
 std.err <- .363
 LL = est  - 1.96 * std.err
 UB = est + 1.96*std.err

 ci <- cbind(Estmiate = est, "SE" = std.err,LL,UB)
 print(ci)
 rm(est,std.err,LL,UB)

 #---------------------------------
 # roll your own bs
 #---------------------------------

 # define treatment and control groups for separate bootstraps

 treat <- df_chrt_match[df_chrt$treat == 1,]
 ctrl <- df_chrt_match[df_chrt$treat == 0,]

 t0 <- mean(treat$ERVisits) - mean(ctrl$ERVisits) # point estimate

 set.seed(1234567) 
 bstrap <- c() # initialize vector for bootstrapped estimates
 for (i in 1:10000) {
  sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ]  # sample from control group
  # sample from data with replacement (this is the bs sample)
  bstrap <- c(bstrap, (mean(sample1$ERVisits) - mean(sample2$ERVisits))) # calculate estimate from bs sample
 }

 hist(bstrap) # view bs distribution
 summary(bstrap) # summary stats

 sd(bstrap, na.rm = TRUE)  # standard error
 quantile(bstrap, c(.025,.975),na.rm = TRUE)  # 95% bootstrapped CI

 results <- data.frame(bstrap)

 b.under.H0 <- results$bstrap - mean(results$bstrap) # p-value based on bs dist
 mean(abs(b.under.H0) > abs(t0))

 rm(bstrap,treat, ctrl, sample1, sample2) # cleanup


 ##################################################
 # compare to basic boot from boot package
 ##################################################

 # function to bootstrap the dummy variable regression estimate of differences in means
 # this is different from above where we were pulling separate samples from the treatment and control groups

 diff <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample 
  fit <- lm(formula, data=d)
  return(coef(fit)[2]) # get the coefficient for treatment (this gives mean differences)
 } 

 # bootstrapping with 10000 replications 
 results <- boot(data=df_chrt_match, statistic=diff, 
                R=10000, formula=ERVisits  ~  treat)

 # view results
 plot(results)

 # get 95% bs confidence interval 
 boot.ci(results, type="all")  # matches percentile method from above & same standard error we got above

 b.under.H0 <- results$t - mean(results$t) 
 mean(abs(b.under.H0) > abs(results$t0))   # same p-value as above

 #-----------------------------------------
 # example for difference-in-differences via R bloggers
 #---------------------------------------

 # Reference: https://www.r-bloggers.com/bootstrapping-standard-errors-for-difference-in-differences-estimation-with-r/

 run_DiD <- function(my_data, indices){
    d <- my_data[indices,]
    return(
        mean(d$rprice[d$year==1981 & d$nearinc==1]) - 
        mean(d$rprice[d$year==1981 & d$nearinc==0]) - 
        (mean(d$rprice[d$year==1978 & d$nearinc==1]) - 
        mean(d$rprice[d$year==1978 & d$nearinc==0]))
    )
 }

 boot_est <- boot(data, run_DiD, R=1000, parallel="multicore", ncpus = 2)

 #### using our example above: (note data was restructured
 run_DiD <- function(my_data, indices){
  d <- my_data[indices,]
  return(
    (mean(d$ERVisits[d$treat == 1]) - mean(d$ERVisits[d$treat == 1])) - 
      (mean(d$ERVisits[d$treat == 0]) - mean(d$ERVisits[d$treat == 0]) )
  )
 }

 # bootstrapping with 10000 replications 
 set.seed(123456)
 results <- boot(data=df_chrt, statistic=run_DiD, 
                R=10000)

 print(results)

 # view results
 plot(results)

 # get 95% bs confidence interval 
 boot.ci(results, type="all")

 # adhoc bs p-value
 b.under.H0 <- results$t - mean(results$t) 
 mean(abs(b.under.H0) > abs(results$t0))

 #-----------------------------------------
 # roll your own
 #---------------------------------------

 treat <- df_chrt[df_chrt$treat == 1,]
 ctrl <- df_chrt[df_chrt$treat == 0,]

 t0 <- (mean(treat$Post_ERVisits) - mean(treat$Pre_ERVisits)) - 
  (mean(ctrl$Post_ERVisits) - mean(ctrl$Pre_ERVisits) )

 set.seed(1234567) 
 bstrap <- c() # initialize vector for bootstrapped estimates
 for (i in 1:10000) {
  sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ]  # sample from control group
  # sample from data with replacement (this is the bs sample)
  bstrap <- c(bstrap,(mean(sample1$Post_ERVisits) - mean(sample1$Pre_ERVisits)) - 
                (mean(sample2$Post_ERVisits) - mean(sample2$Pre_ERVisits) ) ) # calculate estimate from bs sample
 }

 hist(bstrap) # view bs distribution
 summary(bstrap) # summary stats

 sd(bstrap, na.rm = TRUE)  # standard error
 quantile(bstrap, c(.025,.975),na.rm = TRUE)  # 95% bootstrapped CI

 bsresults <- data.frame(bstrap)

 b.under.H0 <- bsresults$bstrap - mean(bsresults$bstrap) # p-value based on bs dist
 mean(abs(b.under.H0) > abs(t0))

 rm(bstrap,treat, ctrl, sample1, sample2) # cleanup
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: basic boot.R
	# \| DATE: 11/9/19
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: demo R boot package and home grown bootsrapping for differences in means and difference-in-differences
	# *----------------------------------------------------------------

	#---------------------------------
	# baseline OLS model
	#---------------------------------

	# set linear model baseline
	summary(lm(ERVisits ~ treat, data = df_chrt))

	# 95% CI
	est <- -.266
	std.err <- .363
	LL = est - 1.96 * std.err
	UB = est + 1.96*std.err

	ci <- cbind(Estmiate = est, "SE" = std.err,LL,UB)
	print(ci)
	rm(est,std.err,LL,UB)

	#---------------------------------
	# roll your own bs
	#---------------------------------

	# define treatment and control groups for separate bootstraps

	treat <- df_chrt_match[df_chrt$treat == 1,]
	ctrl <- df_chrt_match[df_chrt$treat == 0,]

	t0 <- mean(treat$ERVisits) - mean(ctrl$ERVisits) # point estimate

	set.seed(1234567)
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:10000) {
	sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ] # sample from control group
	# sample from data with replacement (this is the bs sample)
	bstrap <- c(bstrap, (mean(sample1$ERVisits) - mean(sample2$ERVisits))) # calculate estimate from bs sample
	}

	hist(bstrap) # view bs distribution
	summary(bstrap) # summary stats

	sd(bstrap, na.rm = TRUE) # standard error
	quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrapped CI

	results <- data.frame(bstrap)

	b.under.H0 <- results$bstrap - mean(results$bstrap) # p-value based on bs dist
	mean(abs(b.under.H0) > abs(t0))

	rm(bstrap,treat, ctrl, sample1, sample2) # cleanup


	##################################################
	# compare to basic boot from boot package
	##################################################

	# function to bootstrap the dummy variable regression estimate of differences in means
	# this is different from above where we were pulling separate samples from the treatment and control groups

	diff <- function(formula, data, indices) {
	d <- data[indices,] # allows boot to select sample
	fit <- lm(formula, data=d)
	return(coef(fit)[2]) # get the coefficient for treatment (this gives mean differences)
	}

	# bootstrapping with 10000 replications
	results <- boot(data=df_chrt_match, statistic=diff,
	R=10000, formula=ERVisits ~ treat)

	# view results
	plot(results)

	# get 95% bs confidence interval
	boot.ci(results, type="all") # matches percentile method from above & same standard error we got above

	b.under.H0 <- results$t - mean(results$t)
	mean(abs(b.under.H0) > abs(results$t0)) # same p-value as above

	#-----------------------------------------
	# example for difference-in-differences via R bloggers
	#---------------------------------------

	# Reference: https://www.r-bloggers.com/bootstrapping-standard-errors-for-difference-in-differences-estimation-with-r/

	run_DiD <- function(my_data, indices){
	d <- my_data[indices,]
	return(
	mean(d$rprice[d$year==1981 & d$nearinc==1]) -
	mean(d$rprice[d$year==1981 & d$nearinc==0]) -
	(mean(d$rprice[d$year==1978 & d$nearinc==1]) -
	mean(d$rprice[d$year==1978 & d$nearinc==0]))
	)
	}

	boot_est <- boot(data, run_DiD, R=1000, parallel="multicore", ncpus = 2)

	#### using our example above: (note data was restructured
	run_DiD <- function(my_data, indices){
	d <- my_data[indices,]
	return(
	(mean(d$ERVisits[d$treat == 1]) - mean(d$ERVisits[d$treat == 1])) -
	(mean(d$ERVisits[d$treat == 0]) - mean(d$ERVisits[d$treat == 0]) )
	)
	}

	# bootstrapping with 10000 replications
	set.seed(123456)
	results <- boot(data=df_chrt, statistic=run_DiD,
	R=10000)

	print(results)

	# view results
	plot(results)

	# get 95% bs confidence interval
	boot.ci(results, type="all")

	# adhoc bs p-value
	b.under.H0 <- results$t - mean(results$t)
	mean(abs(b.under.H0) > abs(results$t0))

	#-----------------------------------------
	# roll your own
	#---------------------------------------

	treat <- df_chrt[df_chrt$treat == 1,]
	ctrl <- df_chrt[df_chrt$treat == 0,]

	t0 <- (mean(treat$Post_ERVisits) - mean(treat$Pre_ERVisits)) -
	(mean(ctrl$Post_ERVisits) - mean(ctrl$Pre_ERVisits) )

	set.seed(1234567)
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:10000) {
	sample1 <- treat[sample(nrow(treat), nrow(treat), replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), nrow(ctrl), replace = T), ] # sample from control group
	# sample from data with replacement (this is the bs sample)
	bstrap <- c(bstrap,(mean(sample1$Post_ERVisits) - mean(sample1$Pre_ERVisits)) -
	(mean(sample2$Post_ERVisits) - mean(sample2$Pre_ERVisits) ) ) # calculate estimate from bs sample
	}

	hist(bstrap) # view bs distribution
	summary(bstrap) # summary stats

	sd(bstrap, na.rm = TRUE) # standard error
	quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% bootstrapped CI

	bsresults <- data.frame(bstrap)

	b.under.H0 <- bsresults$bstrap - mean(bsresults$bstrap) # p-value based on bs dist
	mean(abs(b.under.H0) > abs(t0))

	rm(bstrap,treat, ctrl, sample1, sample2) # cleanup