Skip to content

Instantly share code, notes, and snippets.

@PM2Ring
Created October 2, 2017 17:13
Show Gist options
  • Save PM2Ring/f21d63b6d3a52dcdab315ad88a775daf to your computer and use it in GitHub Desktop.
Save PM2Ring/f21d63b6d3a52dcdab315ad88a775daf to your computer and use it in GitHub Desktop.
Bloom filter demo
#!/usr/bin/env python3
''' Testing a Bloom filter implementation
See https://en.wikipedia.org/wiki/Bloom_filter
Written by PM 2Ring 2017.09.29
'''
from random import seed, randrange
from array import array
from math import ceil, log
from time import perf_counter
def gcd(a, b):
''' Greatest common divisor of a & b '''
while b:
a, b = b, a % b
return a
def timed(func):
''' Timing decorator '''
def wrapped(*args):
start = perf_counter()
result = func(*args)
stop = perf_counter()
print('{:.6f} seconds'.format(stop - start))
return result
wrapped.__name__ = func.__name__
wrapped.__doc__ = func.__doc__
return wrapped
# Build a table that counts 1 bits in bytes
def make_popcount():
popcount = [0, 1]
for i in range(1, 1<<7):
k = popcount[i]
popcount.append(k)
popcount.append(k + 1)
return popcount
class BloomFilter:
def __init__(self, num, fpp):
self.num = num
self.fpp = fpp
self.primemod = (1<<31) - 1
print('Making filter array...')
self.make_filter_array()
print('Making hash function parameters...')
self.make_hash_params()
def make_filter_array(self):
# Calculate the number of hash functions and filter size
# required for the desired false positive probability,
# rounding the filter size up to a multiple of 8.
k = ceil(-log(self.fpp, 2))
m = ceil(-k * self.num / log(1 - self.fpp ** (1 / k))) // -8 * -8
self.numhashes = k
self.filtermod = m
filterlen = m // 8
print('Number of hashes:', k)
print('Filter array size:', m, 'bits =', filterlen, 'bytes')
self.filterarray = array('B', bytes(filterlen))
def make_hash_params(self):
hashparams = []
primemod = self.primemod
numhashes = self.numhashes
while len(hashparams) < numhashes:
a = randrange(1, primemod)
b = randrange(1, primemod)
if a < b:
a, b = b, a
if gcd(a, b) == 1:
hashparams.append((a, b))
self.hashparams = hashparams
def add(self, x):
''' Add a single item to the filter '''
primemod = self.primemod
filtermod = self.filtermod
filterarray = self.filterarray
for a, b in self.hashparams:
y = ((a*x + b) % primemod) % filtermod
filterarray[y >> 3] |= 1 << (y & 7)
@timed
def update(self, data):
''' Add data collection to the filter '''
hashparams = self.hashparams
primemod = self.primemod
filtermod = self.filtermod
filterarray = self.filterarray
for x in data:
for a, b in hashparams:
y = ((a*x + b) % primemod) % filtermod
filterarray[y >> 3] |= 1 << (y & 7)
##About 5% slower
#@timed
#def updateNEW(self, data):
#''' Add data collection to the filter '''
#for x in data:
#self.add(x)
def bitcount(self, popcount=make_popcount()):
count = sum(popcount[v] for v in self.filterarray)
rate = count / self.filtermod
print('Used bits: {}, rate: {:.6f}'.format(count, rate))
def __contains__(self, x):
''' Test if x is in the filter. This may return false
positives but it does not return false negatives
'''
primemod = self.primemod
filtermod = self.filtermod
filterarray = self.filterarray
for a, b in self.hashparams:
y = ((a*x + b) % primemod) % filtermod
if not (filterarray[y >> 3] >> (y & 7)) & 1:
return False
return True
@timed
def make_data(self):
''' Make a full set of random data '''
primemod = self.primemod
return {randrange(primemod) for _ in range(self.num)}
def test(self, testnum):
print('Making data...')
data = self.make_data()
print('Adding data to filter...')
self.update(data)
self.bitcount()
print('Testing all the data...')
self.test_data(data)
self.test_random(data, testnum)
@timed
def test_data(self, data):
bad = 0
for x in data:
if x not in self:
print('Missed {}!'.format(x))
bad += 1
if bad:
print(bad, 'items not found')
else:
print('All items found')
@timed
def test_random(self, data, testnum):
''' Measure false positive ratio by testing with random
numbers that aren't in the filter's data set
'''
print(testnum, 'random tests...')
primemod = self.primemod
i = count = 0
while i < testnum:
x = randrange(primemod)
if x not in data:
i += 1
if x in self:
count += 1
p = count / i
print(' {} / {} = {:.3}'.format(count, i, p),
end='\r', flush=True)
print('\nFalse positives:', count, count / testnum)
def main():
# Number of items that will be added to the filter
num = 100000
# False positives probability
fpp = 1E-5
seed(42)
print('Number of items:', num)
print('Desired false positive probability:', fpp)
bloom = BloomFilter(num, fpp)
bloom.test(10 * num)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment