Last active
August 29, 2015 14:14
-
-
Save ericfrederich/803fe98c544f7407c8c9 to your computer and use it in GitHub Desktop.
Example bloom filter implementation / test in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import hashlib | |
from math import ceil, log | |
def hasher(x, n): | |
ret = [] | |
h = hashlib.sha1() | |
h.update(x) | |
for i in range(n): | |
h.update('-' * n) | |
ret.append(int(h.hexdigest(), 16)) | |
return ret | |
class Bloom(object): | |
def __init__(self, n_items=None, probability=None, n_bits=None, n_hashes=None): | |
# either specify number of items and probability... | |
if n_items is not None and probability is not None: | |
assert n_bits is None and n_hashes is None | |
# from http://hur.st/bloomfilter | |
n_bits = int(ceil((n_items * log(probability)) / log(1.0 / (pow(2.0, log(2.0)))))) | |
n_hashes = int(round(log(2.0) * n_bits / n_items)) | |
print 'bits :', n_bits | |
print 'hashes:', n_hashes | |
# or number of bits and number of hashes | |
elif n_bits is not None and n_hashes is not None: | |
assert n_items is None and probability is None | |
self.n_bits = n_bits | |
self.n_hashes = n_hashes | |
self.data = [0] * n_bits | |
def add(self, x): | |
for result in hasher(x, self.n_hashes): | |
self.data[result % self.n_bits] += 1 | |
def remove(self, x): | |
indices = [result % self.n_bits for result in hasher(x, self.n_hashes)] | |
for i in indices: | |
assert self.data[i] != 0 | |
for i in indices: | |
self.data[i] -= 1 | |
def contains(self, x): | |
for result in hasher(x, self.n_hashes): | |
if not self.data[result % self.n_bits]: | |
return False | |
return True | |
if __name__ == '__main__': | |
import sys | |
probability = float(sys.argv[1]) | |
words = open('/usr/share/dict/words').read().splitlines() | |
b = Bloom(len(words) / 2, probability) | |
# build up a bloom filter containing half of the words | |
for word in words[:len(words)/2]: | |
# definite collision since words is unique | |
# if b.contains(word): | |
# print "uh oh", word | |
b.add(word) | |
collisions = 0 | |
# check to see if the other half has the desired probability | |
for word in words[len(words)/2:]: | |
if b.contains(word): | |
collisions += 1 | |
print collisions, 'collisions' | |
print collisions / (len(words) / 2.0), 'vs', probability |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment