Created
March 6, 2025 18:20
-
-
Save qpwo/104c01c509b192672ffdbef14fb8fca4 to your computer and use it in GitHub Desktop.
53-bit hash collision (birthday paradox for int53)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mpmath as mp | |
import math | |
# Set high precision | |
mp.mp.dps = 50 | |
def calculate_collision_probability(n, d): | |
""" | |
Calculate probability of at least one collision when choosing n items from d possibilities | |
Using the birthday paradox/collision probability formula: 1 - exp(-n(n-1)/(2d)) | |
n: number of rows/items | |
d: number of possible values (2^53 for 53-bit IDs) | |
""" | |
n = mp.mpf(n) | |
d = mp.mpf(d) | |
# Exact calculation using 1 - (d!/(d^n * (d-n)!)) | |
# For large numbers, we use approximation: 1 - exp(-n(n-1)/(2d)) | |
exponent = -n * (n - 1) / (2 * d) | |
probability = 1 - mp.exp(exponent) | |
return probability | |
def binary_search_for_threshold(target_probability, d): | |
"""Find number of items needed for target collision probability using binary search""" | |
low = 1 | |
# Initial upper bound - we know it can't be more than sqrt(2*d*ln(1/(1-p))) | |
high = int(mp.sqrt(2 * d * mp.log(1/(1-target_probability))) * 1.1) | |
while high - low > 1: | |
mid = (low + high) // 2 | |
p = calculate_collision_probability(mid, d) | |
if p < target_probability: | |
low = mid | |
else: | |
high = mid | |
return high | |
# Total possible values with 53-bit IDs | |
total_possible_values = mp.mpf(2) ** 53 | |
# Target probability (50%) | |
target_probability = mp.mpf(0.5) | |
# Calculate using binary search | |
rows_needed = binary_search_for_threshold(target_probability, total_possible_values) | |
# For verification, calculate probability with the found value | |
final_probability = calculate_collision_probability(rows_needed, total_possible_values) | |
print(f"Number of rows needed for 50% collision probability with 53-bit IDs: {rows_needed:,}") | |
print(f"Verification - Collision probability with {rows_needed:,} rows: {final_probability}") | |
# We can also use the approximation formula sqrt(2*d*ln(2)) | |
approximation = mp.sqrt(2 * total_possible_values * mp.log(2)) | |
print(f"Approximation using sqrt(2*d*ln(2)): {int(approximation):,}") | |
# -> | |
# Number of rows needed for 50% collision probability with 53-bit IDs: 111,743,589 | |
# Verification - Collision probability with 111,743,589 rows: 0.5000000009583060331150473405099425149363224710168 | |
# Approximation using sqrt(2*d*ln(2)): 111,743,588 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment