Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created March 16, 2021 13:33
Show Gist options
  • Save BioSciEconomist/350dc87eab5172dbdcc680cd3341d377 to your computer and use it in GitHub Desktop.
Demonstrate the impact of multiple tests and false discovery rates
# *-----------------------------------------------------------------
# | PROGRAM NAME: sim multiple tests.py
# | DATE: 3/12/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: demonstrate the impact of multiple tests and false discovery rates
# *----------------------------------------------------------------
# suppress sci notation
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:.2f}'.format)


def _simulate_group(d_flag, id_start):
    """Draw 10,000 obs with y1 ~ N(95, 75) and y2 ~ N(100, 75).

    d_flag labels the group (1 = treatment, 0 = control); IDs run from
    id_start. Seeds are fixed so both groups get identical outcome draws —
    the true treatment effect is zero by construction.
    """
    grp = pd.DataFrame(columns=['y1', 'y2', 'D'])
    np.random.seed(123)
    grp['y1'] = np.random.normal(95, 75, 10000)
    np.random.seed(456)
    grp['y2'] = np.random.normal(100, 75, 10000)
    grp['D'] = [d_flag] * 10000
    grp['ID'] = list(range(id_start, id_start + 10000))
    return grp


# simulate treatment and control arms and stack them
trt = _simulate_group(1, 1)
ctrl = _simulate_group(0, 10001)
df = pd.concat([trt, ctrl], ignore_index=True)

# quick sanity checks on the pooled data
df.head()
df.tail()
df.describe()
df.shape
df.groupby('D')['y1'].mean()
df.groupby('D')['y2'].mean()
# The treatment variable should be insignificant: both arms were drawn from
# identical distributions, so the true coefficient on D is zero.
import statsmodels.api as sm  # statsmodels core API
import statsmodels.formula.api as smf  # R-style formula interface

# OLS of each outcome on the treatment dummy over the full simulated data
for _outcome in ('y1', 'y2'):
    results = smf.ols(_outcome + ' ~ D', data=df).fit()
    results.summary2()
#-------------------------------------
# power analysis
#------------------------------------
# How large a sample do we need to have a 95% probability (power) of
# detecting a difference of 5 units at a 5% significance level
# (false positive rate)?
import statsmodels
from statsmodels.stats import power

diff = 5  # minimum effect size of interest (raw units)
sigmaA = 75  # standard deviation for group A
sigmaB = 75  # standard deviation for group B (common variance assumed for now)
Q0 = .5  # proportion in the control group (.5 implies equal sized groups)

# pooled standard deviation across the two groups, weighted by allocation
sigma_pooled = np.sqrt(
    (np.square(sigmaA) * (1 / Q0 - 1) + np.square(sigmaB) * (1 / (1 - Q0) - 1))
    / ((1 / Q0 - 1) + (1 / (1 - Q0) - 1))
)
#
d = abs(diff) / sigma_pooled  # standardized (Cohen's d) effect size
print(sigma_pooled)
print(diff, d)

# calculate required sample size per group
statsmodels.stats.power.tt_ind_solve_power(
    effect_size=d, nobs1=None, alpha=.05, power=.95,
    ratio=1.0, alternative='two-sided')
# let's round this to 6000
#---------------------------------
# test on a well powered random sample
#--------------------------------
# draw 6000 per arm (with replacement), the size suggested by the power analysis
np.random.seed(123)
sample_t = trt.sample(6000, replace=True)
np.random.seed(123)
sample_c = ctrl.sample(6000, replace=True)

# stack treatment on top of control
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.shape
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
# we can conclude there is no impact
#-----------------------------------
# demonstrate the 5% false positive rate for a single test
#----------------------------------
def run_tests(trt_dat, ctrl_dat, size, n=6000):
    """Repeatedly bootstrap-sample both arms and test y1 ~ D on each draw.

    Parameters
    ----------
    trt_dat, ctrl_dat : pd.DataFrame
        Treatment and control pools; must contain columns 'y1' and 'D'.
    size : int
        Number of bootstrap iterations (one OLS test per iteration).
    n : int, default 6000
        Observations drawn with replacement from each pool per iteration
        (previously hard-coded; default preserves the original behavior).

    Returns
    -------
    (pvals, seeds) : tuple of np.ndarray
        p-value on the D coefficient and the RNG seed used for each draw.
    """
    pvals = np.empty(size)
    seeds = np.empty(size)
    for i in range(size):
        # NOTE(review): the control seed of iteration i (i + 1) equals the
        # treatment seed of iteration i + 1, so successive draws are not
        # fully independent; kept as-is so the reported seeds stay
        # reproducible downstream.
        np.random.seed(i)
        sample_t = trt_dat.sample(n, replace=True)
        np.random.seed(i + 1)
        sample_c = ctrl_dat.sample(n, replace=True)
        tmp = pd.concat([sample_t, sample_c], ignore_index=True)
        # p-value on the treatment dummy; since the true effect is zero,
        # any p < .05 here is a false positive
        model = smf.ols('y1 ~ D', data=tmp).fit()
        pvals[i] = model.pvalues.loc['D']
        seeds[i] = i
    return pvals, seeds
# run 100 bootstrap tests of y1 ~ D
results = run_tests(trt, ctrl, 100)

# assemble a per-seed report of the p-values
report = pd.DataFrame(columns=['seed', 'pval', 'sig'])
report['seed'] = results[1]
report['pval'] = results[0]
# flag the false positives (p-val < .05)
report['sig'] = np.where(report['pval'] < .05, 1, 0)
report.head()
report.describe()  # roughly the 5% false-positive rate we expect
# subset the cases that were false positives
tmp = report[report.sig == 1]
print(tmp)
#--------------------------------
# replicate a sample that gives a false positive
#-------------------------------
# re-draw the bootstrap sample that used seed = 27 (flagged significant above)
np.random.seed(27)
sample_t = trt.sample(6000, replace=True)
np.random.seed(28)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.groupby('D')['y1'].mean()

# OLS reproduces the spurious 'significant' effect of D on y1 in this sample
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
#--------------------------------
# analysis with 2 outcomes
#-------------------------------
def run_tests(trt_dat, ctrl_dat, size, n=6000):
    """Bootstrap both arms repeatedly and test y1 ~ D AND y2 ~ D per draw.

    NOTE(review): this shadows the single-outcome run_tests defined earlier
    in the file; only this two-outcome version is callable afterwards.

    Parameters
    ----------
    trt_dat, ctrl_dat : pd.DataFrame
        Treatment and control pools; must contain columns 'y1', 'y2', 'D'.
    size : int
        Number of bootstrap iterations (two OLS tests per iteration).
    n : int, default 6000
        Observations drawn with replacement from each pool per iteration
        (previously hard-coded; default preserves the original behavior).

    Returns
    -------
    (pvals1, pvals2, seeds) : tuple of np.ndarray
        p-values on D for y1 and y2, plus the RNG seed used for each draw.
    """
    pvals1 = np.empty(size)
    pvals2 = np.empty(size)
    seeds = np.empty(size)
    for i in range(size):
        # same seeding scheme as the single-outcome version, so draws are
        # reproducible from the reported seed
        np.random.seed(i)
        sample_t = trt_dat.sample(n, replace=True)
        np.random.seed(i + 1)
        sample_c = ctrl_dat.sample(n, replace=True)
        tmp = pd.concat([sample_t, sample_c], ignore_index=True)
        # test the treatment dummy against both outcomes on the SAME sample
        model1 = smf.ols('y1 ~ D', data=tmp).fit()
        pvals1[i] = model1.pvalues.loc['D']
        model2 = smf.ols('y2 ~ D', data=tmp).fit()
        pvals2[i] = model2.pvalues.loc['D']
        seeds[i] = i
    return pvals1, pvals2, seeds
# run 500 bootstrap iterations, testing both outcomes each time
results = run_tests(trt, ctrl, 500)

report = pd.DataFrame(columns=['seed', 'pvals1', 'pvals2', 'sig'])
report['seed'] = results[2]
report['pvals1'] = results[0]
report['pvals2'] = results[1]

# a run counts as a false positive if EITHER outcome tests significant
report['sig'] = np.where((report['pvals1'] < .05) | (report['pvals2'] < .05), 1, 0)
report.describe()  # roughly the 10% familywise rate we expect with two tests

# subset the cases that were false positives
tmp = report[report.sig == 1]
tmp.head(20)
tmp.tail(10)
tmp.shape

# what if we adjusted our p-value?
# Bonferroni-style correction: test each outcome at .05 / 2 = .025
report['sig_adj'] = np.where((report['pvals1'] < .025) | (report['pvals2'] < .025), 1, 0)
report.describe()
#-----------------------------
# scenario 1
#----------------------------
# reproduce the bootstrap draw with seed = 162
np.random.seed(162)
sample_t = trt.sample(6000, replace=True)
np.random.seed(163)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
#-----------------------------
# scenario 2
#----------------------------
# reproduce the bootstrap draw with seed = 63
np.random.seed(63)
sample_t = trt.sample(6000, replace=True)
np.random.seed(64)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment