Last active
November 14, 2024 16:57
-
-
Save pervognsen/b21f6dd13f4bcb4ff2123f0d78fcfd17 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hash, displace, and compress: http://cmph.sourceforge.net/papers/esa09.pdf | |
# This is expected linear time for any seeded hash function that acts like a random hash function (universality isn't enough). | |
# (Actually, the code as written is O(n log n) when targeting 100% load. It's O(n) when targeting any smaller load factor.) | |
# You can make keys_per_bucket higher than the default of 4 but construction time will start to increase dramatically. | |
# The paper this is based on compresses the seeds (so the fact that the algorithm tries seeds in increasing order is important) | |
# which brings the representation size close to the information-theoretical minimum. I don't do any of that here, but it could | |
# be done as a postprocess. | |
def make_perfect_hash(keys, load_factor=1.0, keys_per_bucket=4, rhash=murmurhash, max_seed=1000000): | |
m = int(len(keys) / load_factor) | |
r = int(len(keys) / keys_per_bucket) | |
buckets = [[] for i in range(r)] | |
for k in keys: buckets[rhash(0, k, r)].append(k) | |
occupied = set() | |
seeds = [0 for i in range(r)] | |
for i, bucket in sorted(enumerate(buckets), reverse=True, key=lambda x: len(x[1])): | |
for seed in range(max_seed): | |
bucket_occupied = set() | |
for k in bucket: | |
h = rhash(seed, k, m) | |
if h in bucket_occupied or h in occupied: break | |
bucket_occupied.add(h) | |
else: | |
occupied.update(bucket_occupied) | |
seeds[i] = seed | |
break | |
else: | |
raise ValueError("Failed to find perfect hash function") | |
return lambda k: rhash(seeds[rhash(0, k, r)], k, m) | |
words = open("words.txt").read().splitlines() # ~100,000 words | |
perfect_hash = make_perfect_hash(words) # ~30 seconds | |
assert sorted(perfect_hash(word) for word in words) == list(range(len(words))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment