Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created March 16, 2021 13:33
Show Gist options
  • Save BioSciEconomist/350dc87eab5172dbdcc680cd3341d377 to your computer and use it in GitHub Desktop.
Demonstrate the impact of multiple tests and false discovery rates
# *-----------------------------------------------------------------
# | PROGRAM NAME: sim multiple tests.py
# | DATE: 3/12/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: demonstrate the impact of multiple tests and false discovery rates
# *----------------------------------------------------------------
# suppress sci notation
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:.2f}'.format)


def _simulate_group(d_flag, id_start):
    """Draw 10,000 obs with y1 ~ N(95, 75) and y2 ~ N(100, 75).

    d_flag labels the group (1 = treatment, 0 = control); IDs run from
    id_start. Seeds are fixed so both groups get identical outcome draws —
    the true treatment effect is zero by construction.
    """
    grp = pd.DataFrame(columns=['y1', 'y2', 'D'])
    np.random.seed(123)
    grp['y1'] = np.random.normal(95, 75, 10000)
    np.random.seed(456)
    grp['y2'] = np.random.normal(100, 75, 10000)
    grp['D'] = [d_flag] * 10000
    grp['ID'] = list(range(id_start, id_start + 10000))
    return grp


# simulate treatment and control arms and stack them
trt = _simulate_group(1, 1)
ctrl = _simulate_group(0, 10001)
df = pd.concat([trt, ctrl], ignore_index=True)

# quick sanity checks on the pooled data
df.head()
df.tail()
df.describe()
df.shape
df.groupby('D')['y1'].mean()
df.groupby('D')['y2'].mean()
# The treatment variable should be insignificant: both arms were drawn from
# identical distributions, so the true coefficient on D is zero.
import statsmodels.api as sm  # statsmodels core API
import statsmodels.formula.api as smf  # R-style formula interface

# OLS of each outcome on the treatment dummy over the full simulated data
for _outcome in ('y1', 'y2'):
    results = smf.ols(_outcome + ' ~ D', data=df).fit()
    results.summary2()
#-------------------------------------
# power analysis
#------------------------------------
# How large a sample do we need to have a 95% probability (power) of
# detecting a difference of 5 units at a 5% significance level
# (false positive rate)?
import statsmodels
from statsmodels.stats import power

diff = 5  # minimum effect size of interest (raw units)
sigmaA = 75  # standard deviation for group A
sigmaB = 75  # standard deviation for group B (common variance assumed for now)
Q0 = .5  # proportion in the control group (.5 implies equal sized groups)

# pooled standard deviation across the two groups, weighted by allocation
sigma_pooled = np.sqrt(
    (np.square(sigmaA) * (1 / Q0 - 1) + np.square(sigmaB) * (1 / (1 - Q0) - 1))
    / ((1 / Q0 - 1) + (1 / (1 - Q0) - 1))
)
#
d = abs(diff) / sigma_pooled  # standardized (Cohen's d) effect size
print(sigma_pooled)
print(diff, d)

# calculate required sample size per group
statsmodels.stats.power.tt_ind_solve_power(
    effect_size=d, nobs1=None, alpha=.05, power=.95,
    ratio=1.0, alternative='two-sided')
# let's round this to 6000
#---------------------------------
# test on a well powered random sample
#--------------------------------
# draw 6000 per arm (with replacement), the size suggested by the power analysis
np.random.seed(123)
sample_t = trt.sample(6000, replace=True)
np.random.seed(123)
sample_c = ctrl.sample(6000, replace=True)

# stack treatment on top of control
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.shape
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
# we can conclude there is no impact
#-----------------------------------
# demonstrate the 5% false positive rate for a single test
#----------------------------------
def run_tests(trt_dat, ctrl_dat, size, n=6000):
    """Repeatedly bootstrap-sample both arms and test y1 ~ D on each draw.

    Parameters
    ----------
    trt_dat, ctrl_dat : pd.DataFrame
        Treatment and control pools; must contain columns 'y1' and 'D'.
    size : int
        Number of bootstrap iterations (one OLS test per iteration).
    n : int, default 6000
        Observations drawn with replacement from each pool per iteration
        (previously hard-coded; default preserves the original behavior).

    Returns
    -------
    (pvals, seeds) : tuple of np.ndarray
        p-value on the D coefficient and the RNG seed used for each draw.
    """
    pvals = np.empty(size)
    seeds = np.empty(size)
    for i in range(size):
        # NOTE(review): the control seed of iteration i (i + 1) equals the
        # treatment seed of iteration i + 1, so successive draws are not
        # fully independent; kept as-is so the reported seeds stay
        # reproducible downstream.
        np.random.seed(i)
        sample_t = trt_dat.sample(n, replace=True)
        np.random.seed(i + 1)
        sample_c = ctrl_dat.sample(n, replace=True)
        tmp = pd.concat([sample_t, sample_c], ignore_index=True)
        # p-value on the treatment dummy; since the true effect is zero,
        # any p < .05 here is a false positive
        model = smf.ols('y1 ~ D', data=tmp).fit()
        pvals[i] = model.pvalues.loc['D']
        seeds[i] = i
    return pvals, seeds
# run 100 bootstrap tests of y1 ~ D
results = run_tests(trt, ctrl, 100)

# assemble a per-seed report of the p-values
report = pd.DataFrame(columns=['seed', 'pval', 'sig'])
report['seed'] = results[1]
report['pval'] = results[0]
# flag the false positives (p-val < .05)
report['sig'] = np.where(report['pval'] < .05, 1, 0)
report.head()
report.describe()  # roughly the 5% false-positive rate we expect
# subset the cases that were false positives
tmp = report[report.sig == 1]
print(tmp)
#--------------------------------
# replicate a sample that gives a false positive
#-------------------------------
# re-draw the bootstrap sample that used seed = 27 (flagged significant above)
np.random.seed(27)
sample_t = trt.sample(6000, replace=True)
np.random.seed(28)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.groupby('D')['y1'].mean()

# OLS reproduces the spurious 'significant' effect of D on y1 in this sample
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
#--------------------------------
# analysis with 2 outcomes
#-------------------------------
def run_tests(trt_dat, ctrl_dat, size, n=6000):
    """Bootstrap both arms repeatedly and test y1 ~ D AND y2 ~ D per draw.

    NOTE(review): this shadows the single-outcome run_tests defined earlier
    in the file; only this two-outcome version is callable afterwards.

    Parameters
    ----------
    trt_dat, ctrl_dat : pd.DataFrame
        Treatment and control pools; must contain columns 'y1', 'y2', 'D'.
    size : int
        Number of bootstrap iterations (two OLS tests per iteration).
    n : int, default 6000
        Observations drawn with replacement from each pool per iteration
        (previously hard-coded; default preserves the original behavior).

    Returns
    -------
    (pvals1, pvals2, seeds) : tuple of np.ndarray
        p-values on D for y1 and y2, plus the RNG seed used for each draw.
    """
    pvals1 = np.empty(size)
    pvals2 = np.empty(size)
    seeds = np.empty(size)
    for i in range(size):
        # same seeding scheme as the single-outcome version, so draws are
        # reproducible from the reported seed
        np.random.seed(i)
        sample_t = trt_dat.sample(n, replace=True)
        np.random.seed(i + 1)
        sample_c = ctrl_dat.sample(n, replace=True)
        tmp = pd.concat([sample_t, sample_c], ignore_index=True)
        # test the treatment dummy against both outcomes on the SAME sample
        model1 = smf.ols('y1 ~ D', data=tmp).fit()
        pvals1[i] = model1.pvalues.loc['D']
        model2 = smf.ols('y2 ~ D', data=tmp).fit()
        pvals2[i] = model2.pvalues.loc['D']
        seeds[i] = i
    return pvals1, pvals2, seeds
# run 500 bootstrap iterations, testing both outcomes each time
results = run_tests(trt, ctrl, 500)

report = pd.DataFrame(columns=['seed', 'pvals1', 'pvals2', 'sig'])
report['seed'] = results[2]
report['pvals1'] = results[0]
report['pvals2'] = results[1]

# a run counts as a false positive if EITHER outcome tests significant
report['sig'] = np.where((report['pvals1'] < .05) | (report['pvals2'] < .05), 1, 0)
report.describe()  # roughly the 10% familywise rate we expect with two tests

# subset the cases that were false positives
tmp = report[report.sig == 1]
tmp.head(20)
tmp.tail(10)
tmp.shape

# what if we adjusted our p-value?
# Bonferroni-style correction: test each outcome at .05 / 2 = .025
report['sig_adj'] = np.where((report['pvals1'] < .025) | (report['pvals2'] < .025), 1, 0)
report.describe()
#-----------------------------
# scenario 1
#----------------------------
# reproduce the bootstrap draw with seed = 162
np.random.seed(162)
sample_t = trt.sample(6000, replace=True)
np.random.seed(163)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
#-----------------------------
# scenario 2
#----------------------------
# reproduce the bootstrap draw with seed = 63
np.random.seed(63)
sample_t = trt.sample(6000, replace=True)
np.random.seed(64)
sample_c = ctrl.sample(6000, replace=True)

# stack data
tmp = pd.concat([sample_t, sample_c], ignore_index=True)
tmp.head()
tmp.describe()

# test impact of treatment 'D' on outcomes y1 and y2 via OLS
results = smf.ols('y1 ~ D', data=tmp).fit()
results.summary2()
results = smf.ols('y2 ~ D', data=tmp).fit()
results.summary2()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment