BioSciEconomist · October 16, 2020 23:38
diff --git a/ex bootstrapped inference.py b/ex bootstrapped inference.py
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex bootstrapped inference.py
 # | DATE: 10/16/20
 # | CREATED BY: MATT BOGARD
 # | PROJECT FILE:         
 # *----------------------------------------------------------------
 # | PURPOSE: example of bootstrapped inference for comparing two sample means
 # *----------------------------------------------------------------
 
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 
 pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
 
 
 df1 = pd.read_csv('//Data//cohort.csv')
 
 
 # define treatment and control group data for separate bootstraps
 mask = (df1['treat'] == 1)
 treat = df1.Cost[mask]
 
 mask = (df1['treat'] == 0)
 ctrl = df1.Cost[mask]
 
 t0 =treat.mean() - ctrl.mean() # point estimate
 
 
 #
 # define function for creating bootstrap samples
 #
 
 def draw_bs(trt_dat,ctrl_dat,size):
    """creates a bootstrap sample, computes replicates and returns replicates array"""
    # Create an empty array to store replicates
    bs_replicates = np.empty(size)
   
    # pull bootstrapped samples
    for i in range(size):
        # Create a bootstrap sample
        sample1 = trt_dat.sample(frac=1, replace=True)
        sample2 = ctrl_dat.sample(frac=1, replace=True)
        # Get bootstrap replicate and append to bs_replicates
        bs_replicates[i] = sample1.mean() - sample2.mean()
   
    return bs_replicates
 
 # draw bootstrap samples of the statistic or parameter estimate of interest
 bs_replicates = draw_bs(treat, ctrl,5000)
 
  
 #
 # analysis
 #
 
 results = pd.DataFrame(columns=['SE', 'P-Value','Lower','Upper'])
 results['SE'] = [(np.std(bs_replicates))] # standard error
 # p-value - what fraction of bs estimates exceed t0
 b_under_Ho = bs_replicates - np.mean(bs_replicates)
 results['P-Value'] = [sum(abs(b_under_Ho) > abs(t0))/5000] # implied p-value
 results['Lower'] = np.percentile(bs_replicates,[2.5]) # 95% CI lower bound
 results['Upper'] = np.percentile(bs_replicates,[97.5]) # 95% CI upper bound
 
 print(results)
 
 #
 # benchmark against standard linear regression
 #

 import statsmodels.api as sm # import stastmodels
 import statsmodels.formula.api as smf # this allows us to use an explicit formulation

 # regression using smf
 results = smf.ols('Cost ~ treat', data=df1).fit()
 results.summary2() # summary2 will override scientific notation
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex bootstrapped inference.py
	# \| DATE: 10/16/20
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: example of bootstrapped inference for comparing two sample means
	# *----------------------------------------------------------------

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation


	df1 = pd.read_csv('//Data//cohort.csv')


	# define treatment and control group data for separate bootstraps
	mask = (df1['treat'] == 1)
	treat = df1.Cost[mask]

	mask = (df1['treat'] == 0)
	ctrl = df1.Cost[mask]

	t0 =treat.mean() - ctrl.mean() # point estimate


	#
	# define function for creating bootstrap samples
	#

	def draw_bs(trt_dat,ctrl_dat,size):
	"""creates a bootstrap sample, computes replicates and returns replicates array"""
	# Create an empty array to store replicates
	bs_replicates = np.empty(size)

	# pull bootstrapped samples
	for i in range(size):
	# Create a bootstrap sample
	sample1 = trt_dat.sample(frac=1, replace=True)
	sample2 = ctrl_dat.sample(frac=1, replace=True)
	# Get bootstrap replicate and append to bs_replicates
	bs_replicates[i] = sample1.mean() - sample2.mean()

	return bs_replicates

	# draw bootstrap samples of the statistic or parameter estimate of interest
	bs_replicates = draw_bs(treat, ctrl,5000)


	#
	# analysis
	#

	results = pd.DataFrame(columns=['SE', 'P-Value','Lower','Upper'])
	results['SE'] = [(np.std(bs_replicates))] # standard error
	# p-value - what fraction of bs estimates exceed t0
	b_under_Ho = bs_replicates - np.mean(bs_replicates)
	results['P-Value'] = [sum(abs(b_under_Ho) > abs(t0))/5000] # implied p-value
	results['Lower'] = np.percentile(bs_replicates,[2.5]) # 95% CI lower bound
	results['Upper'] = np.percentile(bs_replicates,[97.5]) # 95% CI upper bound

	print(results)

	#
	# benchmark against standard linear regression
	#

	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation

	# regression using smf
	results = smf.ols('Cost ~ treat', data=df1).fit()
	results.summary2() # summary2 will override scientific notation