Skip to content

Instantly share code, notes, and snippets.

@ijkilchenko
Created January 22, 2016 01:57
Show Gist options
  • Save ijkilchenko/6cf9795fff45a08b00be to your computer and use it in GitHub Desktop.
Save ijkilchenko/6cf9795fff45a08b00be to your computer and use it in GitHub Desktop.
import random
import math
from hashlib import md5
class MyHashTable(object):
def __init__(self, hash_functions, len_table):
self.hash_functions = hash_functions
self.len_table = len_table
self.table = [-1]*self.len_table
def add_elem(self, elem):
for hash_function in self.hash_functions:
hash_value = hash_function(elem) % self.len_table
self.table[hash_value] = 1
def lookup_elem(self, elem):
for hash_function in self.hash_functions:
hash_value = hash_function(elem) % self.len_table
if self.table[hash_value] == -1:
return -1
return 1
class MyHashTableWithChaining(MyHashTable):
def add_elem(self, elem):
for hash_function in self.hash_functions:
hash_value = hash_function(elem) % self.len_table
if self.table[hash_value] == -1:
self.table[hash_value] = [elem]
else:
if elem not in self.table[hash_value]:
self.table[hash_value].append(elem)
def lookup_elem(self, elem):
for hash_function in self.hash_functions:
hash_value = hash_function(elem) % self.len_table
if self.table[hash_value] == -1 or elem not in self.table[hash_value]:
return -1
return 1
class BloomFilter(object):
def __init__(self, hash_functions, len_smaller_table, hash_function, len_bigger_table):
self.hash_functions = hash_functions
self.len_smaller_table = len_smaller_table
self.hash_function = hash_function
self.len_bigger_table = len_bigger_table
self.small_table = MyHashTable(hash_functions, len_smaller_table)
self.big_table = MyHashTableWithChaining([hash_function], len_bigger_table)
def add_elem(self, elem):
self.small_table.add_elem(elem)
self.big_table.add_elem(elem)
def lookup_elem(self, elem):
small_table_result = self.small_table.lookup_elem(elem)
if small_table_result == -1:
return -1, "true negative"
else:
big_table_result = self.big_table.lookup_elem(elem)
if small_table_result != big_table_result:
return self.big_table.lookup_elem(elem), "false positive"
else:
return self.big_table.lookup_elem(elem), "true positive"
def lookup_elem_with_test_for_false_negatives(self, elem):
small_table_result, result_type = self.lookup_elem(elem)
if not result_type.endswith('positive'):
big_table_result = self.big_table.lookup_elem(elem)
if small_table_result != big_table_result:
assert False
else:
return self.big_table.lookup_elem(elem), "true negative"
else:
return small_table_result, result_type
def test_BloomFilter():
salts = 'the quick brown fox'
def add_salt(method, salt):
def method_with_salt(*args, **kwargs):
args = ((str(args[0]) + str(salt)).encode('utf-8'),) + args[1:]
return int(method(*args, **kwargs).hexdigest(), 16)
return method_with_salt
hash_functions = [add_salt(md5, salt) for salt in salts.split()]
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def make_random_words():
return [''.join([alphabet[math.floor(random.random()*len(alphabet))] for _ in range(10)]) for _ in range(100)]
elements_inside = make_random_words()
elements_outside = make_random_words()
bloom = BloomFilter(hash_functions[1:], int(10e2), hash_functions[0], int(10e4))
for elem in elements_inside:
bloom.add_elem(elem)
small_table_result_type_map = {}
for i, elem in enumerate(elements_inside + elements_outside):
result, small_table_result_type = bloom.lookup_elem_with_test_for_false_negatives(elem)
if i < len(elements_inside):
assert result == 1
else:
assert result == -1
try:
count = small_table_result_type_map[small_table_result_type]
small_table_result_type_map[small_table_result_type] = count + 1
except KeyError:
small_table_result_type_map[small_table_result_type] = 1
print(small_table_result_type_map)
if __name__ == '__main__':
random.seed(12345)
test_BloomFilter()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment