Bag of Little Bootstraps (w/ example usage)
import random
from typing import (
    List,
    Dict,
    Callable,
    Union,
)

import numpy

Number = Union[int, float]
Array = numpy.array

# A sample stored by the counts of its unique values.
SampleCounts = Dict[Number, int]  # unique_value: count


#############################
# Estimators
#############################

SampleValues = Array  # [float, ...]
ValueCounts = Array  # [int, ...]
Estimate = float
Estimator = Callable[[SampleValues, ValueCounts], Estimate]


def mean_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    sum_values = (counts * values).sum()
    num_values = counts.sum()
    return sum_values / num_values
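

# Added sketch (not in the original gist): a weighted-median estimator,
# to show how another statistic plugs into the Estimator interface.
# It treats `counts` as multiplicities and returns the lower weighted median.
def median_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    order = numpy.argsort(values)
    cumulative = numpy.cumsum(counts[order])
    midpoint = cumulative[-1] / 2.0
    idx = int(numpy.searchsorted(cumulative, midpoint))
    return float(values[order][idx])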
#############################
# Estimate Quality Assessors
#############################

Estimates = Array  # [Estimate, ...]
EstimateAssessor = Callable[[Estimates], Estimate]


def standard_deviation(estimates: Estimates) -> Estimate:
    return estimates.std()


def percentile_05(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 5)


def percentile_95(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 95)


def percentile_NN(nn) -> EstimateAssessor:
    assert 0 <= nn <= 100

    def p_nn(estimates: Estimates) -> Estimate:
        return numpy.percentile(estimates, nn)
    return p_nn
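

# Example (added): percentile_NN builds arbitrary percentile assessors,
# e.g. interquartile bounds:
#   q1_q3_assessors = [percentile_NN(25), percentile_NN(75)]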
#############################
# Algorithm
#############################


def bag_little_bootstraps(
    data: SampleCounts,
    estimator: Estimator,
    estimate_assessors: List[EstimateAssessor],
    s=20,
    r=50,
    b_power=0.65,
) -> List[Estimate]:
    '''
    Bag of little bootstraps:
    https://arxiv.org/pdf/1112.5016.pdf

    The default args are sane for a variety of situations, but if you
    need to be precise you can tune them by running the relative-error
    analysis from the paper with various values.

    Params
        data: Input data you sampled from some population
        estimator: Function that calculates an estimate (e.g. mean_estimator)
        estimate_assessors: Functions that assess estimates (e.g. standard_deviation)
        s: Number of subsamples
        r: Number of bootstrap resamples per subsample
        b_power: Determines the size of each subsample (b = n ** b_power)
    '''
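    # For intuition (added): with n = 1_000_000 and b_power = 0.65,
    # b = int(1_000_000 ** 0.65) ≈ 7_943, so each subsample holds less
    # than 1% of the data while each resample still "weighs" n points.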
    data_values = list(data.keys())  # Unique values
    data_value_counts = list(data.values())  # Unique value counts
    data_size = sum(data_value_counts)

    # Determine b
    # - "b" is the subsample size
    b = int(data_size ** b_power)
    uniform_subsample_weights = Array([1.0 / b] * b)

    # Every assessor gets s subset assessments
    assessor_estimates = []  # [[est_1, est_2, ..., est_s], ...]
    for _ in estimate_assessors:
        assessor_estimates.append(Array([0.0] * s))

    # Calculate assessor estimates for every subset
    for s_j in range(s):
        # Pick a random subset of data
        # - The size of the sample is "b"
        # - Sample w/out replacement
        # - random.sample(..., counts=...) requires Python 3.9+
        sample_subset = Array(
            random.sample(
                data_values,
                counts=data_value_counts,
                k=b,
            )
        )

        # Bootstrap resamples
        # - Calculate an estimate for every resample
        b_estimates = Array([0.0] * r)
        for b_k in range(r):
            # Create a bootstrap resample
            # - Bootstrap resample size is the size of the original
            #   sample, not the size of the subset sample.
            b_counts = numpy.random.multinomial(
                data_size,
                pvals=uniform_subsample_weights,
            )
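            # Note (added): a multinomial draw over the b subsample points
            # is equivalent to resampling data_size points with replacement
            # from the subsample, but needs only O(b) memory, not O(n).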
            # Calculate estimate of the resampled data
            b_estimates[b_k] = estimator(sample_subset, b_counts)

        # Calculate an estimate quality assessment for each assessor
        for i, estimate_assessor in enumerate(estimate_assessors):
            assessor_estimates[i][s_j] = estimate_assessor(b_estimates)

    # Average each assessor's assessments across the subsets to get final estimates
    final_assessment_estimates = []
    for sample_ests in assessor_estimates:
        final_assessment_estimates.append(sample_ests.mean())
    return final_assessment_estimates
def mean_characterization_example():
    '''
    Use BLB to estimate the percentiles of the sampling distribution of the mean.
    '''
    from collections import Counter

    # Create a sample from Uniform(0, 100)
    # - Discretize it to 0.1
    N = 1_000_000
    print(f"\nCreating data, size: {N} ...")
    data = [round(random.random() * 100, 1) for _ in range(N)]
    data = Counter(data)

    # Run BLB
    # - Can swap out mean_estimator for median, ...
    # - Can swap out percentile_05 for standard_deviation, ...
    print("BLB Bootstrapping...")
    estimator = mean_estimator
    # estimate_assessors = [percentile_05, percentile_95]
    estimate_assessors = [percentile_NN(x) for x in range(101)]
    assessor_stats = bag_little_bootstraps(
        data,
        estimator=estimator,
        estimate_assessors=estimate_assessors,
    )
    print(f" 1st percentile: {assessor_stats[1]}")
    print(f"99th percentile: {assessor_stats[99]}")


if __name__ == "__main__":
    mean_characterization_example()
Thanks for this! I just computed a 3Q estimator of a huge dataset >14M lines in a few seconds!
You're welcome!
It's been a while since I implemented that, so it might be worth double-checking it. Maybe do a slow bootstrap once over the 14M dataset w/ multinomial and see if it gives the same answer?
The research you do seems really interesting!
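For concreteness, a minimal sketch of that one-shot sanity check (not from the gist; it reuses the gist's Array, SampleCounts, Estimator, and EstimateAssessor names, and r=200 is an arbitrary choice). It bootstraps the full dataset directly, drawing one multinomial over the unique-value counts per resample:

def naive_bootstrap(
    data: SampleCounts,
    estimator: Estimator,
    estimate_assessor: EstimateAssessor,
    r=200,
) -> Estimate:
    # Classic (slow) bootstrap over the full dataset, for cross-checking BLB.
    values = Array(list(data.keys()))
    counts = Array(list(data.values()))
    n = counts.sum()
    # Each unique value is resampled with probability count / n.
    pvals = counts / n
    estimates = Array([0.0] * r)
    for k in range(r):
        resample_counts = numpy.random.multinomial(n, pvals=pvals)
        estimates[k] = estimator(values, resample_counts)
    return estimate_assessor(estimates)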