Created
March 16, 2021 13:33
-
-
Save BioSciEconomist/350dc87eab5172dbdcc680cd3341d377 to your computer and use it in GitHub Desktop.
Demonstrate the impact of multiple tests and false discovery rates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: sim multiple tests.py | |
# | DATE: 3/12/21 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: demonstrate the impact of multiple tests and false discovery rates | |
# *---------------------------------------------------------------- | |
# suppress scientific notation in pandas display output
import pandas as pd
import numpy as np
pd.set_option('display.float_format', '{:.2f}'.format)

# --- simulate data -------------------------------------------------
# NOTE(review): the same seeds (123/456) are reused for both groups, so the
# treatment and control outcome draws are identical — presumably deliberate,
# to guarantee a true null effect of D; confirm.
np.random.seed(123)
y1_draws_t = np.random.normal(95, 75, 10000)
np.random.seed(456)
y2_draws_t = np.random.normal(100, 75, 10000)
trt = pd.DataFrame({'y1': y1_draws_t,
                    'y2': y2_draws_t,
                    'D': [1]*10000,
                    'ID': list(range(1, 10001))})

np.random.seed(123)
y1_draws_c = np.random.normal(95, 75, 10000)
np.random.seed(456)
y2_draws_c = np.random.normal(100, 75, 10000)
ctrl = pd.DataFrame({'y1': y1_draws_c,
                     'y2': y2_draws_c,
                     'D': [0]*10000,
                     'ID': list(range(10001, 20001))})

# stack treatment and control into a single analysis frame
df = pd.concat([trt, ctrl], ignore_index=True)

# quick REPL-style sanity checks (no output when run as a script)
df.head()
df.tail()
df.describe()
df.shape
df.groupby('D')['y1'].mean()
df.groupby('D')['y2'].mean()
# With identical outcome draws in both groups, the treatment dummy D
# should be statistically insignificant on the full data.
import statsmodels.api as sm                  # statsmodels core API
import statsmodels.formula.api as smf         # R-style formula interface

for formula in ('y1 ~ D', 'y2 ~ D'):
    results = smf.ols(formula, data=df).fit()
    results.summary2()
#-------------------------------------
# power analysis
#------------------------------------
# How large a sample do we need per group for a 95% probability (power)
# of detecting a 5-unit difference at a 5% significance level
# (false-positive rate)?
import statsmodels
from statsmodels.stats import power

diff = 5        # minimum detectable effect we want to measure
sigmaA = 75     # standard deviation for group A
sigmaB = 75     # standard deviation for group B (assume common variance for now)
Q0 = .5         # proportion in the control group (.5 => equal-sized groups)

# pooled standard deviation, weighted by the group allocation
weight_a = 1/Q0 - 1
weight_b = 1/(1-Q0) - 1
sigma_pooled = np.sqrt((np.square(sigmaA)*weight_a + np.square(sigmaB)*weight_b)/(weight_a + weight_b))

d = abs(diff)/sigma_pooled  # standardized (Cohen's d) effect size
print(sigma_pooled)
print(diff,d)

# calculate the required sample size per group
statsmodels.stats.power.tt_ind_solve_power(effect_size= d, nobs1=None, alpha=.05, power= .95, ratio=1.0, alternative='two-sided')
# round this up to 6000 per group
# let's round this to 6000 | |
#--------------------------------- | |
# test on a well powered random sample | |
#-------------------------------- | |
np.random.seed(123) | |
sample_t = trt.sample(6000, replace=True) | |
np.random.seed(123) | |
sample_c = ctrl.sample(6000, replace=True) | |
# stack data | |
tmp = pd.concat([sample_t,sample_c],ignore_index=True) | |
tmp.shape | |
tmp.head() | |
tmp.describe() | |
# test impact of treatment 'D' on outcomes y1 and y2 | |
# regression using smf | |
results = smf.ols('y1 ~ D', data=tmp).fit() | |
results.summary2() | |
results = smf.ols('y2 ~ D', data=tmp).fit() | |
results.summary2() | |
# we can conclude there is no impact | |
#----------------------------------- | |
# demonstrate that the 5% false positive rate for a single test | |
#---------------------------------- | |
def run_tests(trt_dat,ctrl_dat,size):
    """Repeat the A/B test `size` times on seeded bootstrap samples.

    For replicate i the treated sample is drawn under seed i and the
    control sample under seed i + 1, each 6000 rows with replacement.
    Returns a tuple (p-values for the D coefficient in `y1 ~ D`, seeds used).

    NOTE(review): consecutive replicates share a seed (control of replicate
    i uses the same seed as treatment of replicate i + 1) — presumably
    acceptable for this demo; the later "replicate by seed" sections depend
    on exactly this scheme, so it must not change.
    """
    p_list = []
    seed_list = []
    for rep in range(size):
        # draw a fresh bootstrap pair for this replicate
        np.random.seed(rep)
        boot_t = trt_dat.sample(6000, replace=True)
        np.random.seed(rep + 1)
        boot_c = ctrl_dat.sample(6000, replace=True)
        stacked = pd.concat([boot_t, boot_c], ignore_index=True)
        # p-value on the treatment dummy D
        fit = smf.ols('y1 ~ D', data=stacked).fit()
        p_list.append(fit.pvalues.loc['D'])
        seed_list.append(rep)
    return np.array(p_list), np.array(seed_list, dtype=float)
# run 100 replicates and tabulate how often the null is falsely rejected
results = run_tests(trt,ctrl,100)
pvals_arr, seed_arr = results
report = pd.DataFrame({'seed': seed_arr, 'pval': pvals_arr})

# flag the false positives (p-val < .05)
report['sig'] = (report['pval'] < .05).astype(int)
report.head()
report.describe()  # roughly the 5% we expect

# pull out the replicates that came up as false positives
tmp = report.loc[report['sig'] == 1]
print(tmp)
#--------------------------------
# replicate a sample that gives a false positive
#-------------------------------
# rebuild the bootstrap pair drawn under seed 27 (treated) / 28 (control)
np.random.seed(27)
sample_t = trt.sample(6000, replace=True)
np.random.seed(28)
sample_c = ctrl.sample(6000, replace=True)

# stack the pair and inspect the group means
tmp = pd.concat([sample_t,sample_c],ignore_index=True)
tmp.groupby('D')['y1'].mean()

# the y1 regression should (spuriously) flag the treatment here
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
#-------------------------------- | |
# analysis with 2 outcomes | |
#------------------------------- | |
def run_tests(trt_dat,ctrl_dat,size):
    """Repeat the experiment `size` times, testing TWO outcomes per sample.

    Same seeded bootstrap scheme as before (seed i for the treated sample,
    seed i + 1 for the control sample, 6000 rows each with replacement);
    fits `y1 ~ D` and `y2 ~ D` on every stacked sample.
    Returns (p-values for y1, p-values for y2, seeds used).
    """
    p1_list = []
    p2_list = []
    seed_list = []
    for rep in range(size):
        # draw a fresh bootstrap pair for this replicate
        np.random.seed(rep)
        boot_t = trt_dat.sample(6000, replace=True)
        np.random.seed(rep + 1)
        boot_c = ctrl_dat.sample(6000, replace=True)
        stacked = pd.concat([boot_t, boot_c], ignore_index=True)
        # one test per outcome on the SAME sample — this is the multiplicity
        p1_list.append(smf.ols('y1 ~ D', data=stacked).fit().pvalues.loc['D'])
        p2_list.append(smf.ols('y2 ~ D', data=stacked).fit().pvalues.loc['D'])
        seed_list.append(rep)
    return np.array(p1_list), np.array(p2_list), np.array(seed_list, dtype=float)
# run 500 replicates, two tests per replicate
results = run_tests(trt,ctrl,500)
p1_arr, p2_arr, seed_arr = results
report = pd.DataFrame({'seed': seed_arr, 'pvals1': p1_arr, 'pvals2': p2_arr})

# a replicate is a false positive if EITHER test clears p < .05
report['sig'] = ((report['pvals1'] < .05) | (report['pvals2'] < .05)).astype(int)
report.describe()  # roughly the ~10% family-wise rate we expect

# pull out the replicates that came up as false positives
tmp = report.loc[report['sig'] == 1]
tmp.head(20)
tmp.tail(10)
tmp.shape

# Bonferroni-style adjustment: split alpha across the 2 tests (.05 / 2)
report['sig_adj'] = ((report['pvals1'] < .025) | (report['pvals2'] < .025)).astype(int)
report.describe()
#-----------------------------
# scenario 1
#----------------------------
# rebuild the bootstrap pair drawn under seed 162 (treated) / 163 (control)
np.random.seed(162)
sample_t = trt.sample(6000, replace=True)
np.random.seed(163)
sample_c = ctrl.sample(6000, replace=True)

# stack and inspect
tmp = pd.concat([sample_t,sample_c],ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2
for formula in ('y1 ~ D', 'y2 ~ D'):
    results = smf.ols(formula, data=tmp).fit()
    results.summary2()
#-----------------------------
# scenario 2
#----------------------------
# rebuild the bootstrap pair drawn under seed 63 (treated) / 64 (control)
# (the original comment said "seed = 27" — stale copy-paste; the code uses 63/64)
np.random.seed(63)
sample_t = trt.sample(6000, replace=True)
np.random.seed(64)
sample_c = ctrl.sample(6000, replace=True)

# stack and inspect
tmp = pd.concat([sample_t,sample_c],ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2
for formula in ('y1 ~ D', 'y2 ~ D'):
    results = smf.ols(formula, data=tmp).fit()
    results.summary2()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment