Bootstrapping t-test
""" | |
This is a simple example to show how to calculate the p_value of two models' accuracy | |
Bootstrapint t-test | |
""" | |
import random | |
random.seed(42) | |
# assume we have test set 1000 samples | |
# we just create dummy results to demo | |
groundtruth = [random.choice(['A', 'B', 'C']) for _ in range(1000)] | |
# create the model a predictions, with a probability 0.8, for each position to take the value from groundtruth | |
# and with a probability 0.2, to take a random value from ['A', 'B', 'C'] | |
model_a_predictions = [groundtruth[i] if random.random() < 0.7 else random.choice(['A', 'B', 'C']) for i in range(1000)] | |
# create the model a predictions, with a probability 0.6, for each position to take the value from groundtruth | |
# and with a probability 0.2, to take a random value from ['A', 'B', 'C'] | |
model_b_predictions = [groundtruth[i] if random.random() < 0.65 else random.choice(['A', 'B', 'C']) for i in range(1000)] | |
## print current accuracy of model a and model b | |
model_a_correct = sum([1 if p == g else 0 for p, g in zip(model_a_predictions, groundtruth)]) / len(groundtruth) | |
print(f"model a accuracy: {model_a_correct}") | |
model_b_correct = sum([1 if p == g else 0 for p, g in zip(model_b_predictions, groundtruth)]) / len(groundtruth) | |
print(f"model b accuracy: {model_b_correct}") | |
import time | |
time.sleep(2) | |
# simulation amount should be as large as possible | |
simulation_amount = 10000 | |
model_a_win_count = 0 | |
for current_num in range(simulation_amount): | |
# randomly take 1000 samples with replacement | |
# 1000 is the size of the test set | |
gts = [] | |
model_a = [] | |
model_b = [] | |
for _ in range(len(groundtruth)): | |
idx = random.randint(0, len(groundtruth) - 1) | |
gt_sample = groundtruth[idx] | |
model_a_sample = model_a_predictions[idx] | |
model_b_sample = model_b_predictions[idx] | |
gts.append(gt_sample) | |
model_a.append(model_a_sample) | |
model_b.append(model_b_sample) | |
# calculate the accuracy of model a | |
model_a_correct = [1 if p == g else 0 for p, g in zip(model_a, gts)] | |
model_a_correct_sum = sum(model_a_correct) | |
model_a_accuracy = model_a_correct_sum / len(groundtruth) | |
# calculate the accuracy of model b | |
model_b_correct = [1 if p == g else 0 for p, g in zip(model_b, gts)] | |
model_b_correct_sum = sum(model_b_correct) | |
model_b_accuracy = model_b_correct_sum / len(groundtruth) | |
model_a_win_count = model_a_win_count + 1 if model_a_accuracy > model_b_accuracy else model_a_win_count | |
# when it runs longer, it will converge to a more accurate p_value | |
# just roughly look at the printing results | |
print(f"model A significantly better than B with current p_value: { 1 - model_a_win_count / simulation_amount}") | |
print(f"model A significantly better than B with p_value: {1 - model_a_win_count / simulation_amount}") |
simulation_amount is set to 10000 here; it can be 100000 or even larger, as large as possible. The more resamples, the more stable the p_value estimate.
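The pure-Python loop can get slow at large simulation counts. Below is a minimal vectorized sketch of the same paired bootstrap, assuming NumPy is available; the helper name paired_bootstrap_p_value and the n_sims parameter are illustrative, not part of the gist.

import numpy as np

rng = np.random.default_rng(42)

def paired_bootstrap_p_value(gt, pred_a, pred_b, n_sims=10000):
    # per-sample 0/1 correctness for each model, aligned on the same indices
    gt = np.asarray(gt)
    correct_a = np.asarray(pred_a) == gt
    correct_b = np.asarray(pred_b) == gt
    n = len(gt)
    # draw all resample indices at once, with replacement: shape (n_sims, n)
    idx = rng.integers(0, n, size=(n_sims, n))
    acc_a = correct_a[idx].mean(axis=1)  # accuracy of model A on each resample
    acc_b = correct_b[idx].mean(axis=1)  # accuracy of model B on each resample
    wins = int((acc_a > acc_b).sum())
    return 1 - wins / n_sims

# usage with the arrays defined in the gist above:
# p = paired_bootstrap_p_value(groundtruth, model_a_predictions, model_b_predictions)

Drawing all indices at once trades memory (n_sims x n integers) for speed; for very large n_sims, drawing the resamples in chunks keeps memory bounded.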
As the simulation runs, the printed p_value converges to a stable number.
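To turn the final number into a yes/no call, compare it against a significance level. A minimal sketch, where alpha = 0.05 is the conventional choice and an assumption, not something the gist fixes:

alpha = 0.05  # assumed significance level; pick what fits your setting
p_value = 1 - model_a_win_count / simulation_amount
if p_value < alpha:
    print(f"model A is significantly better than model B (p_value = {p_value:.4f})")
else:
    print(f"no significant difference detected (p_value = {p_value:.4f})")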