Last active
July 7, 2017 18:16
-
-
Save johannah/6d8bd3584e0c498136477740dfbfdd20 to your computer and use it in GitHub Desktop.
stats for hackers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
# example taken from Jake Vanderplas' talk "Statistics for Hackers" | |
def is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=False,null_hyp=.05): | |
# find difference of means to test null hyp | |
odiff = np.mean(s1)-np.mean(s2) | |
# join the lists together | |
a = np.array(s1+s2) | |
asize = a.shape[0] | |
inds = np.zeros(asize) | |
diffs = [] | |
for t in range(trials): | |
# set random indexes to zero | |
inds[:] = 0 | |
# generate random indexes to simulate s2 | |
g2inds = np.random.randint(low=0, high=asize, size=len(s2)) | |
inds[g2inds] = 1 | |
s1sm = np.mean(a[inds==0]) | |
s2sm = np.mean(a[inds==1]) | |
d = s1sm-s2sm | |
diffs.append(d) | |
num_greater = np.sum([np.array(diffs)>odiff]) | |
greater_than_null = num_greater/float(trials) | |
is_significant = greater_than_null<null_hyp | |
if do_plot: | |
plt.title("Obs diff: %s s1>s2: %s at p==%s" %(odiff, is_significant, null_hyp)) | |
plt.hist(diffs, bins=50) | |
plt.axvline(odiff, color='r', linestyle='dashed', linewidth=2) | |
plt.show() | |
return is_significant, greater_than_null | |
def bootstrap_resampling(stack, num_samples=10000): | |
# idea: simulate the distribution by drawing samples with replacement | |
# motivation: the data estimates its own distribution | |
stack = np.array(stack) | |
stack_size = stack.shape[0] | |
sample_size = int(stack_size*.5) | |
xbar = np.zeros(num_samples) | |
for i in range(num_samples): | |
ind = np.random.randint(low=0,high=stack_size,size=sample_size) | |
sample = stack[ind] | |
xbar[i] = np.mean(sample) | |
print('mean', np.mean(xbar), '+-', np.std(xbar)) | |
return np.mean(xbar), np.std(xbar) | |
if __name__ == '__main__': | |
# test scores. Is s1 smarter than s2? | |
s1 = [84, 57, 63, 99, 72, 46, 76, 91] | |
s2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69] | |
#is_significant, gtn = is_statistically_significant_shuffle(s1,s2,trials=10000,do_plot=True) | |
#print("s1 is statistically better than s2:", is_significant) | |
yurtles = [48,24,51,12,21,41,25,23,32,61,19,24,29,21,23,13,32,18,42,18] | |
# if given infinite time, what is mean of turtles that yurtle would stack? | |
# what is uncertainty on that estimate? | |
bootstrap_resampling(yurtles) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment