Created
October 16, 2020 23:38
-
-
Save BioSciEconomist/8b0c24ac8086b39992ada59ad8867023 to your computer and use it in GitHub Desktop.
Example of bootstrapped inference for comparing two sample means
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: ex bootstrapped inference.py | |
# | DATE: 10/16/20 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: example of bootstrapped inference for comparing two sample means | |
# *---------------------------------------------------------------- | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation | |
df1 = pd.read_csv('//Data//cohort.csv') | |
# define treatment and control group data for separate bootstraps | |
mask = (df1['treat'] == 1) | |
treat = df1.Cost[mask] | |
mask = (df1['treat'] == 0) | |
ctrl = df1.Cost[mask] | |
t0 =treat.mean() - ctrl.mean() # point estimate | |
# | |
# define function for creating bootstrap samples | |
# | |
def draw_bs(trt_dat,ctrl_dat,size): | |
"""creates a bootstrap sample, computes replicates and returns replicates array""" | |
# Create an empty array to store replicates | |
bs_replicates = np.empty(size) | |
# pull bootstrapped samples | |
for i in range(size): | |
# Create a bootstrap sample | |
sample1 = trt_dat.sample(frac=1, replace=True) | |
sample2 = ctrl_dat.sample(frac=1, replace=True) | |
# Get bootstrap replicate and append to bs_replicates | |
bs_replicates[i] = sample1.mean() - sample2.mean() | |
return bs_replicates | |
# draw bootstrap samples of the statistic or parameter estimate of interest | |
bs_replicates = draw_bs(treat, ctrl,5000) | |
# | |
# analysis | |
# | |
results = pd.DataFrame(columns=['SE', 'P-Value','Lower','Upper']) | |
results['SE'] = [(np.std(bs_replicates))] # standard error | |
# p-value - what fraction of bs estimates exceed t0 | |
b_under_Ho = bs_replicates - np.mean(bs_replicates) | |
results['P-Value'] = [sum(abs(b_under_Ho) > abs(t0))/5000] # implied p-value | |
results['Lower'] = np.percentile(bs_replicates,[2.5]) # 95% CI lower bound | |
results['Upper'] = np.percentile(bs_replicates,[97.5]) # 95% CI upper bound | |
print(results) | |
# | |
# benchmark against standard linear regression | |
# | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
# regression using smf | |
results = smf.ols('Cost ~ treat', data=df1).fit() | |
results.summary2() # summary2 will override scientific notation | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment