Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created October 16, 2020 23:38
Show Gist options
  • Save BioSciEconomist/8b0c24ac8086b39992ada59ad8867023 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/8b0c24ac8086b39992ada59ad8867023 to your computer and use it in GitHub Desktop.
Example of bootstrapped inference for comparing two sample means
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex bootstrapped inference.py
# | DATE: 10/16/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example of bootstrapped inference for comparing two sample means
# *----------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
df1 = pd.read_csv('//Data//cohort.csv')
# define treatment and control group data for separate bootstraps
mask = (df1['treat'] == 1)
treat = df1.Cost[mask]
mask = (df1['treat'] == 0)
ctrl = df1.Cost[mask]
t0 =treat.mean() - ctrl.mean() # point estimate
#
# define function for creating bootstrap samples
#
def draw_bs(trt_dat,ctrl_dat,size):
"""creates a bootstrap sample, computes replicates and returns replicates array"""
# Create an empty array to store replicates
bs_replicates = np.empty(size)
# pull bootstrapped samples
for i in range(size):
# Create a bootstrap sample
sample1 = trt_dat.sample(frac=1, replace=True)
sample2 = ctrl_dat.sample(frac=1, replace=True)
# Get bootstrap replicate and append to bs_replicates
bs_replicates[i] = sample1.mean() - sample2.mean()
return bs_replicates
# draw bootstrap samples of the statistic or parameter estimate of interest
bs_replicates = draw_bs(treat, ctrl,5000)
#
# analysis
#
results = pd.DataFrame(columns=['SE', 'P-Value','Lower','Upper'])
results['SE'] = [(np.std(bs_replicates))] # standard error
# p-value - what fraction of bs estimates exceed t0
b_under_Ho = bs_replicates - np.mean(bs_replicates)
results['P-Value'] = [sum(abs(b_under_Ho) > abs(t0))/5000] # implied p-value
results['Lower'] = np.percentile(bs_replicates,[2.5]) # 95% CI lower bound
results['Upper'] = np.percentile(bs_replicates,[97.5]) # 95% CI upper bound
print(results)
#
# benchmark against standard linear regression
#
import statsmodels.api as sm # import stastmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
# regression using smf
results = smf.ols('Cost ~ treat', data=df1).fit()
results.summary2() # summary2 will override scientific notation
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment