Skip to content

Instantly share code, notes, and snippets.

@ericfrederich
Last active August 29, 2015 14:14
Show Gist options
  • Save ericfrederich/803fe98c544f7407c8c9 to your computer and use it in GitHub Desktop.
Save ericfrederich/803fe98c544f7407c8c9 to your computer and use it in GitHub Desktop.
Example bloom filter implementation / test in Python
#!/usr/bin/env python
import hashlib
from math import ceil, log
def hasher(x, n):
ret = []
h = hashlib.sha1()
h.update(x)
for i in range(n):
h.update('-' * n)
ret.append(int(h.hexdigest(), 16))
return ret
class Bloom(object):
def __init__(self, n_items=None, probability=None, n_bits=None, n_hashes=None):
# either specify number of items and probability...
if n_items is not None and probability is not None:
assert n_bits is None and n_hashes is None
# from http://hur.st/bloomfilter
n_bits = int(ceil((n_items * log(probability)) / log(1.0 / (pow(2.0, log(2.0))))))
n_hashes = int(round(log(2.0) * n_bits / n_items))
print 'bits :', n_bits
print 'hashes:', n_hashes
# or number of bits and number of hashes
elif n_bits is not None and n_hashes is not None:
assert n_items is None and probability is None
self.n_bits = n_bits
self.n_hashes = n_hashes
self.data = [0] * n_bits
def add(self, x):
for result in hasher(x, self.n_hashes):
self.data[result % self.n_bits] += 1
def remove(self, x):
indices = [result % self.n_bits for result in hasher(x, self.n_hashes)]
for i in indices:
assert self.data[i] != 0
for i in indices:
self.data[i] -= 1
def contains(self, x):
for result in hasher(x, self.n_hashes):
if not self.data[result % self.n_bits]:
return False
return True
if __name__ == '__main__':
import sys
probability = float(sys.argv[1])
words = open('/usr/share/dict/words').read().splitlines()
b = Bloom(len(words) / 2, probability)
# build up a bloom filter containing half of the words
for word in words[:len(words)/2]:
# definite collision since words is unique
# if b.contains(word):
# print "uh oh", word
b.add(word)
collisions = 0
# check to see if the other half has the desired probability
for word in words[len(words)/2:]:
if b.contains(word):
collisions += 1
print collisions, 'collisions'
print collisions / (len(words) / 2.0), 'vs', probability
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment