Bag of Little Bootstraps (w/ example usage)
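The bag of little bootstraps (BLB, Kleiner et al., https://arxiv.org/pdf/1112.5016.pdf) assesses the quality of an estimator over a large sample without materializing full-size bootstrap resamples: draw s small subsamples of size b = n ** b_power without replacement, blow each one back up to the original size n with multinomial counts, run the estimator on every resample, assess the resulting estimates (e.g. with a standard deviation or a percentile), and average the assessments across the subsamples.
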
import random
from typing import (
    List,
    Dict,
    Callable,
    Union,
)

import numpy
Number = Union[int, float]
Array = numpy.array

# A sample stored as the counts of its unique values.
SampleCounts = Dict[Number, int]  # unique_value: count

#############################
# Estimators
#############################
SampleValues = Array # [float, ...]
ValueCounts = Array # [int, ...]
Estimate = float
Estimator = Callable[[SampleValues, ValueCounts], Estimate]

def mean_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    sum_values = (counts * values).sum()
    num_values = counts.sum()
    return sum_values / num_values
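
# Any function matching this (values, counts) signature can serve as an
# Estimator. A minimal illustrative sketch (not part of the original
# gist): a weighted median over the unique values and their counts.
def median_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    # Sort the unique values and walk the cumulative counts until half
    # of the total mass is covered (takes the lower median on ties).
    order = numpy.argsort(values)
    cumulative = numpy.cumsum(counts[order])
    midpoint = cumulative[-1] / 2.0
    return float(values[order][numpy.searchsorted(cumulative, midpoint)])
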
#############################
# Estimate Quality Assessors
#############################
Estimates = Array # [Estimate, ...]
EstimateAssessor = Callable[[Estimates], Estimate]

def standard_deviation(estimates: Estimates) -> Estimate:
    return estimates.std()


def percentile_05(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 5)


def percentile_95(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 95)


def percentile_NN(nn) -> EstimateAssessor:
    assert 0 <= nn <= 100

    def p_nn(estimates: Estimates) -> Estimate:
        return numpy.percentile(estimates, nn)

    return p_nn
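
# percentile_NN is a factory: percentile_NN(5) builds an assessor
# equivalent to percentile_05 above, and percentile_NN(50) assesses the
# median of the bootstrap estimates.
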
#############################
# Algorithm
#############################

def bag_little_bootstraps(
    data: SampleCounts,
    estimator: Estimator,
    estimate_assessors: List[EstimateAssessor],
    s=20,
    r=50,
    b_power=0.65,
) -> List[Estimate]:
    '''
    Bag of little bootstraps:
    https://arxiv.org/pdf/1112.5016.pdf

    The default args are sane for a variety of situations, but if you
    need to be precise you can tune them by running the relative-error
    analysis from the paper with various values.

    Params
        data: Input data you sampled from some population
        estimator: Function that calculates an estimate
            (e.g. mean_estimator)
        estimate_assessors: Functions that assess estimates
            (e.g. standard_deviation)
        s: Number of subsamples
        r: Number of bootstrap resamples per subsample
        b_power: Determines the subsample size, b = n ** b_power, where
            n is the total sample size
    '''
    data_values = list(data.keys())  # Unique values
    data_value_counts = list(data.values())  # Unique value counts
    data_size = sum(data_value_counts)

    # "b" is the subsample size
    b = int(data_size ** b_power)
    uniform_subsample_weights = Array([1.0 / b] * b)

    # Every assessor gets s subsample assessments
    assessor_estimates = []  # [[est_1, est_2, ..., est_s], ...]
    for _ in estimate_assessors:
        assessor_estimates.append(Array([0.0] * s))

    # Calculate assessor estimates for every subsample
    for s_j in range(s):
        # Pick a random subsample of the data
        # - The size of the subsample is "b"
        # - Sample w/out replacement (random.sample's "counts" argument
        #   requires Python 3.9+)
        sample_subset = Array(
            random.sample(
                data_values,
                counts=data_value_counts,
                k=b,
            )
        )
        # Bootstrap resamples
        # - Calculate an estimate for every resample
        b_estimates = Array([0.0] * r)
        for b_k in range(r):
            # Create a bootstrap resample
            # - The resample size is the size of the original sample, not
            #   the size of the subsample, so each resample is represented
            #   by just b integer counts that sum to data_size
            b_counts = numpy.random.multinomial(
                data_size,
                pvals=uniform_subsample_weights,
            )
            # Calculate the estimate on the resampled data
            b_estimates[b_k] = estimator(sample_subset, b_counts)

        # Calculate an estimate quality assessment for each assessor
        for i, estimate_assessor in enumerate(estimate_assessors):
            assessor_estimates[i][s_j] = estimate_assessor(b_estimates)

    # Average each assessor's per-subsample assessments to get the
    # final estimates
    final_assessment_estimates = []
    for sample_ests in assessor_estimates:
        final_assessment_estimates.append(sample_ests.mean())
    return final_assessment_estimates
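
# A minimal usage sketch for the standard-error path mentioned in the
# example below (hypothetical "my_measurements" data, not part of the
# original gist):
#
#     sample = Counter(my_measurements)  # {value: count}
#     (std_err,) = bag_little_bootstraps(
#         sample,
#         estimator=mean_estimator,
#         estimate_assessors=[standard_deviation],
#     )
#
# ...which estimates the standard error of the mean of my_measurements.
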
def mean_characterization_example():
    '''
    Use BLB to estimate the percentiles of the sampling distribution of
    the mean.
    '''
    from collections import Counter

    # Create a sample from Uniform(0, 100)
    # - Discretize it to 0.1
    N = 1_000_000
    print(f"\nCreating data, size: {N} ...")
    data = [round(random.random() * 100, 1) for _ in range(N)]
    data = Counter(data)

    # Run BLB
    # - Can swap out mean_estimator for median_estimator, ...
    # - Can swap out percentile_05 for standard_deviation, ...
    print("BLB Bootstrapping...")
    estimator = mean_estimator
    # estimate_assessors = [percentile_05, percentile_95]
    estimate_assessors = [percentile_NN(x) for x in range(101)]
    assessor_stats = bag_little_bootstraps(
        data,
        estimator=estimator,
        estimate_assessors=estimate_assessors,
    )
    print(f" 1st percentile: {assessor_stats[1]}")
    print(f"99th percentile: {assessor_stats[99]}")


if __name__ == "__main__":
    mean_characterization_example()
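
As a rough sanity check (not from the original gist): for N = 1,000,000 draws from Uniform(0, 100), the CLT gives the mean a standard error of about sqrt(100**2 / 12) / 1000 ≈ 0.029, so the printed 1st and 99th percentiles should land near 50 - 0.07 and 50 + 0.07.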