@allanj
Created February 7, 2024 09:10
Bootstrapping t-test
"""
This is a simple example to show how to calculate the p_value of two models' accuracy
Bootstrapint t-test
"""
import random
import time

random.seed(42)
# assume we have a test set of 1000 samples;
# we create dummy results to demo
groundtruth = [random.choice(['A', 'B', 'C']) for _ in range(1000)]
# create model a's predictions: at each position, take the value from groundtruth
# with probability 0.7, otherwise take a random value from ['A', 'B', 'C']
model_a_predictions = [groundtruth[i] if random.random() < 0.7 else random.choice(['A', 'B', 'C']) for i in range(1000)]
# create model b's predictions: at each position, take the value from groundtruth
# with probability 0.65, otherwise take a random value from ['A', 'B', 'C']
model_b_predictions = [groundtruth[i] if random.random() < 0.65 else random.choice(['A', 'B', 'C']) for i in range(1000)]
# print the observed accuracy of model a and model b on the full test set
model_a_accuracy = sum(1 if p == g else 0 for p, g in zip(model_a_predictions, groundtruth)) / len(groundtruth)
print(f"model a accuracy: {model_a_accuracy}")
model_b_accuracy = sum(1 if p == g else 0 for p, g in zip(model_b_predictions, groundtruth)) / len(groundtruth)
print(f"model b accuracy: {model_b_accuracy}")
# pause briefly so the baseline accuracies are visible before the simulation output scrolls by
time.sleep(2)
# simulation amount should be as large as possible
simulation_amount = 10000
model_a_win_count = 0
for current_num in range(simulation_amount):
    # draw a bootstrap sample: 1000 indices sampled with replacement
    # (1000 is the size of the test set)
    gts = []
    model_a = []
    model_b = []
    for _ in range(len(groundtruth)):
        idx = random.randint(0, len(groundtruth) - 1)
        gts.append(groundtruth[idx])
        model_a.append(model_a_predictions[idx])
        model_b.append(model_b_predictions[idx])
    # calculate the accuracy of model a on this bootstrap sample
    model_a_accuracy = sum(1 if p == g else 0 for p, g in zip(model_a, gts)) / len(gts)
    # calculate the accuracy of model b on this bootstrap sample
    model_b_accuracy = sum(1 if p == g else 0 for p, g in zip(model_b, gts)) / len(gts)
    if model_a_accuracy > model_b_accuracy:
        model_a_win_count += 1
    # running estimate of the p_value; it divides by the full simulation_amount,
    # so it only becomes accurate as the loop approaches completion;
    # just roughly watch the printed values converge
    print(f"model A significantly better than B with current p_value: {1 - model_a_win_count / simulation_amount}")
print(f"model A significantly better than B with p_value: {1 - model_a_win_count / simulation_amount}")
allanj commented Feb 7, 2024

The results will be printed like this:

model A significantly better than B with current p_value: 0.06269999999999998
model A significantly better than B with current p_value: 0.06259999999999999
model A significantly better than B with current p_value: 0.0625
model A significantly better than B with current p_value: 0.06240000000000001
model A significantly better than B with current p_value: 0.06230000000000002
model A significantly better than B with current p_value: 0.06220000000000003
model A significantly better than B with current p_value: 0.062100000000000044
model A significantly better than B with p_value: 0.062100000000000044

As the iterations go on, the p_value converges to a stable number.
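
For a sense of how stable that number is: the win count is a binomial draw over simulation_amount resamples, so a back-of-envelope 95% interval for the Monte Carlo error (my own addition, not part of the original gist) is:

import math
p_hat = 0.0621  # the final estimate from the run above
s = 10_000      # simulation_amount
se = math.sqrt(p_hat * (1 - p_hat) / s)
print(f"p_value = {p_hat} +/- {1.96 * se:.4f}")  # roughly 0.0621 +/- 0.0047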

allanj commented Feb 7, 2024

simulation_amount = 10000
can be raised to 100000 or even larger; the more simulations, the more accurate the p_value estimate.
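
Note on the size trade-off: the Monte Carlo error shrinks only with the square root of the simulation count, so each extra digit of precision costs roughly 100x more simulations. A quick sketch of my own to illustrate:

import math
p_hat = 0.062
for s in (10_000, 100_000, 1_000_000):
    se = math.sqrt(p_hat * (1 - p_hat) / s)
    print(f"simulation_amount={s:>9,}: +/- {1.96 * se:.4f}")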
