Last active
November 9, 2022 14:58
-
-
Save miguno/f4882d0b40f6ef2368c9705a9a277793 to your computer and use it in GitHub Desktop.
HyperLogLog (HLL) experiments: register bits, register counts, estimation error, error bounds
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runs HLL experiments, using randomly generated strings as input data. | |
# | |
# Requirements | |
# ============ | |
# | |
# # https://github.com/ascv/HyperLogLog | |
# $ pip install HLL | |
# | |
# | |
# How to use | |
# ========== | |
# | |
# $ python3 hll.py | |
# | |
# | |
# References | |
# ========== | |
# https://research.neustar.biz/2012/12/17/hll-intersections-2/ | |
# http://dsinpractice.com/2015/09/07/counting-unique-items-fast-unions-and-intersections/ | |
# https://blog.process-one.net/cardinality-estimation/ | |
# https://blog.demofox.org/2015/03/09/hyperloglog-estimate-unique-value-counts-like-the-pros/ | |
# http://druid.io/blog/2014/02/18/hyperloglog-optimizations-for-real-world-systems.html | |
# https://github.com/ascv/HyperLogLog/issues/9 | |
from HLL import HyperLogLog | |
import math | |
import random | |
import string | |
expCardinality = 100000 | |
# From: https://stackoverflow.com/a/2257449/1743580 | |
def random_string(size=16, chars=string.ascii_uppercase + string.digits): | |
return ''.join(random.choice(chars) for _ in range(size)) | |
# This is a cryptographically more secure variant of `random_string`. | |
# From: https://stackoverflow.com/a/2257449/1743580 | |
def secure_random_string(size=16, chars=string.ascii_uppercase + string.digits): | |
return ''.join(random.SystemRandom().choice(chars) for _ in range(size)) | |
print("HyperLogLog (HLL) experiments") | |
print("=============================") | |
print("True (expected) cardinality: %d" % expCardinality) | |
print(" 68% 95% 99% ") | |
print("+---------------+----------------+--------------------+--------------+--------+--------+--------+") | |
print("| Register bits | Register count | Estim. cardinality | Estim. error | σ | 2σ | 3σ | Within 2σ?") | |
print("+---------------+----------------+--------------------+--------------+--------+--------+--------+") | |
# log2m is the variable name used by the HLL implementation of https://github.com/addthis/stream-lib | |
# = k (HLL python module) | |
# = number of HLL register bits (aka bit indices) | |
for log2m in range(2, 17): | |
hll = HyperLogLog(log2m) | |
registers = math.pow(2, log2m) | |
for x in range(expCardinality): | |
hll.add(random_string()) | |
estimationError = math.fabs((expCardinality - hll.cardinality()) / expCardinality * 1.0) | |
# standard error = sigma = σ = 68% of cases (assumption: normal distribution) | |
sigma = (1.04 / math.sqrt(registers)) | |
# 2σ = 95% of cases | |
sigma2 = 2 * sigma | |
# 3σ = 99% of cases | |
sigma3 = 3 * sigma | |
withinErrorBounds = "" | |
if estimationError < sigma2: | |
withinErrorBounds = "✔" | |
print("| %2d | %8d | %9.1f | %3.4f | %3.4f | %3.4f | %3.4f | %s" % (log2m, registers, hll.cardinality(), estimationError, sigma, sigma2, sigma3, withinErrorBounds)) | |
print("+---------------+----------------+--------------------+--------------+--------+--------+--------+") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: