ericfrederich · August 29, 2015 14:14
diff --git a/bloom_filter.py b/bloom_filter.py
 #!/usr/bin/env python

 import hashlib
 from math import ceil, log

 def hasher(x, n):
    ret = []
    h = hashlib.sha1()
    h.update(x)
    for i in range(n):
        h.update('-' * n)
        ret.append(int(h.hexdigest(), 16))
    return ret

 class Bloom(object):

    def __init__(self, n_items=None, probability=None, n_bits=None, n_hashes=None):
        # either specify number of items and probability...
        if n_items is not None and probability is not None:
            assert n_bits is None and n_hashes is None

            # from http://hur.st/bloomfilter
            n_bits = int(ceil((n_items * log(probability)) / log(1.0 / (pow(2.0, log(2.0))))))
            n_hashes = int(round(log(2.0) * n_bits / n_items))
            print 'bits  :', n_bits
            print 'hashes:', n_hashes

        # or number of bits and number of hashes
        elif n_bits is not None and n_hashes is not None:
            assert n_items is None and probability is None

        self.n_bits = n_bits
        self.n_hashes = n_hashes
        self.data = [0] * n_bits

    def add(self, x):
        for result in hasher(x, self.n_hashes):
            self.data[result % self.n_bits] += 1

    def remove(self, x):
        indices = [result % self.n_bits for result in hasher(x, self.n_hashes)]
        for i in indices:
            assert self.data[i] != 0
        for i in indices:
            self.data[i] -= 1
    
    def contains(self, x):
        for result in hasher(x, self.n_hashes):
            if not self.data[result % self.n_bits]:
                return False
        return True

 if __name__ == '__main__':

    import sys
    probability = float(sys.argv[1])

    words = open('/usr/share/dict/words').read().splitlines()

    b = Bloom(len(words) / 2, probability)

    # build up a bloom filter containing half of the words
    for word in words[:len(words)/2]:
        # definite collision since words is unique
        # if b.contains(word):
        #     print "uh oh", word
        b.add(word)

    collisions = 0
    # check to see if the other half has the desired probability
    for word in words[len(words)/2:]:
        if b.contains(word):
            collisions += 1

    print collisions, 'collisions'
    print collisions / (len(words) / 2.0), 'vs', probability
	#!/usr/bin/env python

	import hashlib
	from math import ceil, log

	def hasher(x, n):
	ret = []
	h = hashlib.sha1()
	h.update(x)
	for i in range(n):
	h.update('-' * n)
	ret.append(int(h.hexdigest(), 16))
	return ret

	class Bloom(object):

	def __init__(self, n_items=None, probability=None, n_bits=None, n_hashes=None):
	# either specify number of items and probability...
	if n_items is not None and probability is not None:
	assert n_bits is None and n_hashes is None

	# from http://hur.st/bloomfilter
	n_bits = int(ceil((n_items * log(probability)) / log(1.0 / (pow(2.0, log(2.0))))))
	n_hashes = int(round(log(2.0) * n_bits / n_items))
	print 'bits :', n_bits
	print 'hashes:', n_hashes

	# or number of bits and number of hashes
	elif n_bits is not None and n_hashes is not None:
	assert n_items is None and probability is None

	self.n_bits = n_bits
	self.n_hashes = n_hashes
	self.data = [0] * n_bits

	def add(self, x):
	for result in hasher(x, self.n_hashes):
	self.data[result % self.n_bits] += 1

	def remove(self, x):
	indices = [result % self.n_bits for result in hasher(x, self.n_hashes)]
	for i in indices:
	assert self.data[i] != 0
	for i in indices:
	self.data[i] -= 1

	def contains(self, x):
	for result in hasher(x, self.n_hashes):
	if not self.data[result % self.n_bits]:
	return False
	return True

	if __name__ == '__main__':

	import sys
	probability = float(sys.argv[1])

	words = open('/usr/share/dict/words').read().splitlines()

	b = Bloom(len(words) / 2, probability)

	# build up a bloom filter containing half of the words
	for word in words[:len(words)/2]:
	# definite collision since words is unique
	# if b.contains(word):
	# print "uh oh", word
	b.add(word)

	collisions = 0
	# check to see if the other half has the desired probability
	for word in words[len(words)/2:]:
	if b.contains(word):
	collisions += 1

	print collisions, 'collisions'
	print collisions / (len(words) / 2.0), 'vs', probability