allanj · February 7, 2024 09:10 · allanj · Feb 7, 2024
diff --git a/bootstrap.py b/bootstrap.py
 """
 This is a simple example to show how to calculate the p_value of two models' accuracy
 Bootstrapint t-test
 """
 import random

 random.seed(42)
 # assume we have test set 1000 samples
 # we just create dummy results to demo
 groundtruth = [random.choice(['A', 'B', 'C']) for _ in range(1000)]

 # create the model a predictions, with a probability 0.8, for each position to take the value from groundtruth
 # and with a probability 0.2, to take a random value from ['A', 'B', 'C']
 model_a_predictions = [groundtruth[i] if random.random() < 0.7 else random.choice(['A', 'B', 'C']) for i in range(1000)]

 # create the model a predictions, with a probability 0.6, for each position to take the value from groundtruth
 # and with a probability 0.2, to take a random value from ['A', 'B', 'C']
 model_b_predictions = [groundtruth[i] if random.random() < 0.65 else random.choice(['A', 'B', 'C']) for i in range(1000)]

 ## print current accuracy of model a and model b
 model_a_correct = sum([1 if p == g else 0 for p, g in zip(model_a_predictions, groundtruth)]) / len(groundtruth)
 print(f"model a accuracy: {model_a_correct}")

 model_b_correct = sum([1 if p == g else 0 for p, g in zip(model_b_predictions, groundtruth)]) / len(groundtruth)
 print(f"model b accuracy: {model_b_correct}")

 import time
 time.sleep(2)


 # simulation amount should be as large as possible
 simulation_amount = 10000


 model_a_win_count = 0
 for current_num in range(simulation_amount):

    # randomly take 1000 samples with replacement
    # 1000 is the size of the test set
    gts = []
    model_a = []
    model_b = []
    for _ in range(len(groundtruth)):
        idx = random.randint(0, len(groundtruth) - 1)
        gt_sample = groundtruth[idx]
        model_a_sample = model_a_predictions[idx]
        model_b_sample = model_b_predictions[idx]
        gts.append(gt_sample)
        model_a.append(model_a_sample)
        model_b.append(model_b_sample)

    # calculate the accuracy of model a
    model_a_correct = [1 if p == g else 0 for p, g in zip(model_a, gts)]
    model_a_correct_sum = sum(model_a_correct)
    model_a_accuracy = model_a_correct_sum / len(groundtruth)

    # calculate the accuracy of model b
    model_b_correct = [1 if p == g else 0 for p, g in zip(model_b, gts)]
    model_b_correct_sum = sum(model_b_correct)
    model_b_accuracy = model_b_correct_sum / len(groundtruth)

    model_a_win_count = model_a_win_count + 1 if model_a_accuracy > model_b_accuracy else model_a_win_count

    # when it runs longer, it will converge to a more accurate p_value
    # just roughly look at the printing results
    print(f"model A significantly better than B with current p_value: { 1 - model_a_win_count / simulation_amount}")

 print(f"model A significantly better than B with p_value: {1 - model_a_win_count / simulation_amount}")
	"""
	This is a simple example to show how to calculate the p_value of two models' accuracy
	Bootstrapint t-test
	"""
	import random

	random.seed(42)
	# assume we have test set 1000 samples
	# we just create dummy results to demo
	groundtruth = [random.choice(['A', 'B', 'C']) for _ in range(1000)]

	# create the model a predictions, with a probability 0.8, for each position to take the value from groundtruth
	# and with a probability 0.2, to take a random value from ['A', 'B', 'C']
	model_a_predictions = [groundtruth[i] if random.random() < 0.7 else random.choice(['A', 'B', 'C']) for i in range(1000)]

	# create the model a predictions, with a probability 0.6, for each position to take the value from groundtruth
	# and with a probability 0.2, to take a random value from ['A', 'B', 'C']
	model_b_predictions = [groundtruth[i] if random.random() < 0.65 else random.choice(['A', 'B', 'C']) for i in range(1000)]

	## print current accuracy of model a and model b
	model_a_correct = sum([1 if p == g else 0 for p, g in zip(model_a_predictions, groundtruth)]) / len(groundtruth)
	print(f"model a accuracy: {model_a_correct}")

	model_b_correct = sum([1 if p == g else 0 for p, g in zip(model_b_predictions, groundtruth)]) / len(groundtruth)
	print(f"model b accuracy: {model_b_correct}")

	import time
	time.sleep(2)


	# simulation amount should be as large as possible
	simulation_amount = 10000


	model_a_win_count = 0
	for current_num in range(simulation_amount):

	# randomly take 1000 samples with replacement
	# 1000 is the size of the test set
	gts = []
	model_a = []
	model_b = []
	for _ in range(len(groundtruth)):
	idx = random.randint(0, len(groundtruth) - 1)
	gt_sample = groundtruth[idx]
	model_a_sample = model_a_predictions[idx]
	model_b_sample = model_b_predictions[idx]
	gts.append(gt_sample)
	model_a.append(model_a_sample)
	model_b.append(model_b_sample)

	# calculate the accuracy of model a
	model_a_correct = [1 if p == g else 0 for p, g in zip(model_a, gts)]
	model_a_correct_sum = sum(model_a_correct)
	model_a_accuracy = model_a_correct_sum / len(groundtruth)

	# calculate the accuracy of model b
	model_b_correct = [1 if p == g else 0 for p, g in zip(model_b, gts)]
	model_b_correct_sum = sum(model_b_correct)
	model_b_accuracy = model_b_correct_sum / len(groundtruth)

	model_a_win_count = model_a_win_count + 1 if model_a_accuracy > model_b_accuracy else model_a_win_count

	# when it runs longer, it will converge to a more accurate p_value
	# just roughly look at the printing results
	print(f"model A significantly better than B with current p_value: { 1 - model_a_win_count / simulation_amount}")

	print(f"model A significantly better than B with p_value: {1 - model_a_win_count / simulation_amount}")