Last active
July 31, 2016 19:05
-
-
Save goldsborough/f4df3da90b1b5a2184379b8a0b542b43 to your computer and use it in GitHub Desktop.
Some popular hash function implementations in Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import math | |
import random | |
WORD_SIZE = 32 | |
WORD_MAX = 1 << 32 | |
WORD_MASK = WORD_MAX - 1 | |
def is_prime(number): | |
if number < 2: | |
return False | |
if number == 2 or number == 3: | |
return True | |
if number % 2 == 0 or number % 3 == 0: | |
return False | |
limit = int(math.ceil(math.sqrt(number))) | |
for k in range(5, limit): | |
if number % k == 0: | |
return False | |
if number % (k + 2) == 0: | |
return False | |
return True | |
def is_odd(number): | |
return number % 2 == 1 | |
def is_even(number): | |
return number % 2 == 0 | |
def is_power_of_two(number): | |
return number & (number - 1) == 0 | |
def fits_into_word(number): | |
return number < WORD_MAX | |
def fast_log(number): | |
assert is_power_of_two(number) | |
assert fits_into_word(number) | |
masks = [0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000] | |
log = (number & masks[0]) != 0 | |
for i in range(4, 0, -1): | |
log |= ((number & masks[i]) != 0) << i | |
return log | |
def simple_universal(key, a, b, p, table_size): | |
assert is_prime(p) | |
return (a * hash(key) + b % p) % table_size | |
def simple_two_universal(key, a, p, table_size): | |
assert is_prime(p) | |
return (a * hash(key) % p) % table_size | |
def generate_simple_universal(table_size): | |
a = random.randrange(1, WORD_MAX) | |
b = random.randrange(1, WORD_MAX) | |
while True: | |
p = random.randrange(table_size, WORD_MAX) | |
if is_prime(p): | |
break | |
def closured_hash_function(key): | |
return simple_universal(key, a, b, p, table_size) | |
return closured_hash_function | |
def fast_universal(key, a, b, table_bits): | |
# A should be odd and not close to a power of 2 | |
assert is_odd(a) | |
# Usually the & WORD_MASK would just be a cast | |
return ((a * hash(key) + b) & WORD_MASK) >> (WORD_SIZE - table_bits) | |
def fast_two_universal(key, a, table_bits): | |
# A should be odd and not close to a power of 2 | |
assert is_odd(a) | |
# Usually the & WORD_MASK would just be a cast | |
return ((a * hash(key)) & WORD_MASK) >> (WORD_SIZE - table_bits) | |
def generate_fast_universal(table_size): | |
# a should be odd and not close to a power of 2 (have many bits set) | |
a = random.randrange(1, WORD_MAX) | |
# If it's a power of 2, we substract one to make it have many bits set | |
# If it's just even but no a power of 2, this still makes it odd | |
if is_even(a): | |
a -= 1 | |
# The number of bits to encode an index into the table | |
# We assume the table size is a power of 2 | |
table_bits = fast_log(table_size) | |
# b should be less than 2^(w - M) where M is m = 2^M | |
b = random.randrange(1 << (WORD_SIZE - table_bits)) | |
def closured_hash_function(key): | |
return fast_universal(key, a, b, table_bits) | |
return closured_hash_function | |
def djb2(string, table_size): | |
hash_value = 5381 | |
for char in string: | |
hash_value = ((hash_value << 5) + hash_value) + ord(char) | |
return hash_value % table_size | |
def fast_djb2(string, table_bits): | |
hash_value = 5381 | |
for char in string: | |
hash_value = ((hash_value << 5) + hash_value) + ord(char) | |
return hash_value & ((1 << table_bits) - 1) | |
def string_pre_hash(string, seed, initial_value=0): | |
hash_value = initial_value | |
for char in string: | |
hash_value = (hash_value * seed) + ord(char) | |
return hash_value | |
def string_hash(string, table_size, seed, initial_value=0): | |
return string_pre_hash(string, seed, initial_value) % table_size | |
def generate_random_vector(dimension, table_size): | |
return tuple([random.randrange(table_size) for _ in range(dimension)]) | |
def split_into_vector(key, dimension, table_bits): | |
vector = [] | |
mask = (1 << table_bits) - 1 | |
for _ in range(dimension): | |
vector.append(key & mask) | |
key >>= table_bits | |
return vector | |
def vector_hash(key, table_size): | |
assert fits_into_word(key) | |
assert is_power_of_two(table_size) | |
table_bits = fast_log(table_size) | |
dimension = int(math.ceil(WORD_SIZE / table_bits)) | |
hash_vector = generate_random_vector(dimension, table_size) | |
key_vector = split_into_vector(key, dimension, table_bits) | |
return sum([hash_vector[i] * key_vector[i] for i in range(dimension)]) | |
def main(): | |
h = generate_fast_universal(256) | |
print([h(123) for _ in range(5)]) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment