Skip to content

Instantly share code, notes, and snippets.

@goldsborough
Last active July 31, 2016 19:05
Show Gist options
  • Save goldsborough/f4df3da90b1b5a2184379b8a0b542b43 to your computer and use it in GitHub Desktop.
Save goldsborough/f4df3da90b1b5a2184379b8a0b542b43 to your computer and use it in GitHub Desktop.
Some popular hash function implementations in Python.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
import random
WORD_SIZE = 32
WORD_MAX = 1 << 32
WORD_MASK = WORD_MAX - 1
def is_prime(number):
if number < 2:
return False
if number == 2 or number == 3:
return True
if number % 2 == 0 or number % 3 == 0:
return False
limit = int(math.ceil(math.sqrt(number)))
for k in range(5, limit):
if number % k == 0:
return False
if number % (k + 2) == 0:
return False
return True
def is_odd(number):
return number % 2 == 1
def is_even(number):
return number % 2 == 0
def is_power_of_two(number):
return number & (number - 1) == 0
def fits_into_word(number):
return number < WORD_MAX
def fast_log(number):
assert is_power_of_two(number)
assert fits_into_word(number)
masks = [0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000]
log = (number & masks[0]) != 0
for i in range(4, 0, -1):
log |= ((number & masks[i]) != 0) << i
return log
def simple_universal(key, a, b, p, table_size):
assert is_prime(p)
return (a * hash(key) + b % p) % table_size
def simple_two_universal(key, a, p, table_size):
assert is_prime(p)
return (a * hash(key) % p) % table_size
def generate_simple_universal(table_size):
a = random.randrange(1, WORD_MAX)
b = random.randrange(1, WORD_MAX)
while True:
p = random.randrange(table_size, WORD_MAX)
if is_prime(p):
break
def closured_hash_function(key):
return simple_universal(key, a, b, p, table_size)
return closured_hash_function
def fast_universal(key, a, b, table_bits):
# A should be odd and not close to a power of 2
assert is_odd(a)
# Usually the & WORD_MASK would just be a cast
return ((a * hash(key) + b) & WORD_MASK) >> (WORD_SIZE - table_bits)
def fast_two_universal(key, a, table_bits):
# A should be odd and not close to a power of 2
assert is_odd(a)
# Usually the & WORD_MASK would just be a cast
return ((a * hash(key)) & WORD_MASK) >> (WORD_SIZE - table_bits)
def generate_fast_universal(table_size):
# a should be odd and not close to a power of 2 (have many bits set)
a = random.randrange(1, WORD_MAX)
# If it's a power of 2, we substract one to make it have many bits set
# If it's just even but no a power of 2, this still makes it odd
if is_even(a):
a -= 1
# The number of bits to encode an index into the table
# We assume the table size is a power of 2
table_bits = fast_log(table_size)
# b should be less than 2^(w - M) where M is m = 2^M
b = random.randrange(1 << (WORD_SIZE - table_bits))
def closured_hash_function(key):
return fast_universal(key, a, b, table_bits)
return closured_hash_function
def djb2(string, table_size):
hash_value = 5381
for char in string:
hash_value = ((hash_value << 5) + hash_value) + ord(char)
return hash_value % table_size
def fast_djb2(string, table_bits):
hash_value = 5381
for char in string:
hash_value = ((hash_value << 5) + hash_value) + ord(char)
return hash_value & ((1 << table_bits) - 1)
def string_pre_hash(string, seed, initial_value=0):
hash_value = initial_value
for char in string:
hash_value = (hash_value * seed) + ord(char)
return hash_value
def string_hash(string, table_size, seed, initial_value=0):
return string_pre_hash(string, seed, initial_value) % table_size
def generate_random_vector(dimension, table_size):
return tuple([random.randrange(table_size) for _ in range(dimension)])
def split_into_vector(key, dimension, table_bits):
vector = []
mask = (1 << table_bits) - 1
for _ in range(dimension):
vector.append(key & mask)
key >>= table_bits
return vector
def vector_hash(key, table_size):
assert fits_into_word(key)
assert is_power_of_two(table_size)
table_bits = fast_log(table_size)
dimension = int(math.ceil(WORD_SIZE / table_bits))
hash_vector = generate_random_vector(dimension, table_size)
key_vector = split_into_vector(key, dimension, table_bits)
return sum([hash_vector[i] * key_vector[i] for i in range(dimension)])
def main():
h = generate_fast_universal(256)
print([h(123) for _ in range(5)])
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment