Skip to content

Instantly share code, notes, and snippets.

@qpwo
Created March 6, 2025 18:20
Show Gist options
  • Save qpwo/104c01c509b192672ffdbef14fb8fca4 to your computer and use it in GitHub Desktop.
Save qpwo/104c01c509b192672ffdbef14fb8fca4 to your computer and use it in GitHub Desktop.
53-bit hash collision (birthday paradox for int53)
import mpmath as mp
import math
# Set high precision
mp.mp.dps = 50
def calculate_collision_probability(n, d):
"""
Calculate probability of at least one collision when choosing n items from d possibilities
Using the birthday paradox/collision probability formula: 1 - exp(-n(n-1)/(2d))
n: number of rows/items
d: number of possible values (2^53 for 53-bit IDs)
"""
n = mp.mpf(n)
d = mp.mpf(d)
# Exact calculation using 1 - (d!/(d^n * (d-n)!))
# For large numbers, we use approximation: 1 - exp(-n(n-1)/(2d))
exponent = -n * (n - 1) / (2 * d)
probability = 1 - mp.exp(exponent)
return probability
def binary_search_for_threshold(target_probability, d):
"""Find number of items needed for target collision probability using binary search"""
low = 1
# Initial upper bound - we know it can't be more than sqrt(2*d*ln(1/(1-p)))
high = int(mp.sqrt(2 * d * mp.log(1/(1-target_probability))) * 1.1)
while high - low > 1:
mid = (low + high) // 2
p = calculate_collision_probability(mid, d)
if p < target_probability:
low = mid
else:
high = mid
return high
# Total possible values with 53-bit IDs
total_possible_values = mp.mpf(2) ** 53
# Target probability (50%)
target_probability = mp.mpf(0.5)
# Calculate using binary search
rows_needed = binary_search_for_threshold(target_probability, total_possible_values)
# For verification, calculate probability with the found value
final_probability = calculate_collision_probability(rows_needed, total_possible_values)
print(f"Number of rows needed for 50% collision probability with 53-bit IDs: {rows_needed:,}")
print(f"Verification - Collision probability with {rows_needed:,} rows: {final_probability}")
# We can also use the approximation formula sqrt(2*d*ln(2))
approximation = mp.sqrt(2 * total_possible_values * mp.log(2))
print(f"Approximation using sqrt(2*d*ln(2)): {int(approximation):,}")
# ->
# Number of rows needed for 50% collision probability with 53-bit IDs: 111,743,589
# Verification - Collision probability with 111,743,589 rows: 0.5000000009583060331150473405099425149363224710168
# Approximation using sqrt(2*d*ln(2)): 111,743,588
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment