Created
October 2, 2017 17:13
-
-
Save PM2Ring/f21d63b6d3a52dcdab315ad88a775daf to your computer and use it in GitHub Desktop.
Bloom filter demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' Testing a Bloom filter implementation | |
See https://en.wikipedia.org/wiki/Bloom_filter | |
Written by PM 2Ring 2017.09.29 | |
''' | |
from random import seed, randrange | |
from array import array | |
from math import ceil, log | |
from time import perf_counter | |
def gcd(a, b): | |
''' Greatest common divisor of a & b ''' | |
while b: | |
a, b = b, a % b | |
return a | |
def timed(func): | |
''' Timing decorator ''' | |
def wrapped(*args): | |
start = perf_counter() | |
result = func(*args) | |
stop = perf_counter() | |
print('{:.6f} seconds'.format(stop - start)) | |
return result | |
wrapped.__name__ = func.__name__ | |
wrapped.__doc__ = func.__doc__ | |
return wrapped | |
# Build a table that counts 1 bits in bytes | |
def make_popcount(): | |
popcount = [0, 1] | |
for i in range(1, 1<<7): | |
k = popcount[i] | |
popcount.append(k) | |
popcount.append(k + 1) | |
return popcount | |
class BloomFilter: | |
def __init__(self, num, fpp): | |
self.num = num | |
self.fpp = fpp | |
self.primemod = (1<<31) - 1 | |
print('Making filter array...') | |
self.make_filter_array() | |
print('Making hash function parameters...') | |
self.make_hash_params() | |
def make_filter_array(self): | |
# Calculate the number of hash functions and filter size | |
# required for the desired false positive probability, | |
# rounding the filter size up to a multiple of 8. | |
k = ceil(-log(self.fpp, 2)) | |
m = ceil(-k * self.num / log(1 - self.fpp ** (1 / k))) // -8 * -8 | |
self.numhashes = k | |
self.filtermod = m | |
filterlen = m // 8 | |
print('Number of hashes:', k) | |
print('Filter array size:', m, 'bits =', filterlen, 'bytes') | |
self.filterarray = array('B', bytes(filterlen)) | |
def make_hash_params(self): | |
hashparams = [] | |
primemod = self.primemod | |
numhashes = self.numhashes | |
while len(hashparams) < numhashes: | |
a = randrange(1, primemod) | |
b = randrange(1, primemod) | |
if a < b: | |
a, b = b, a | |
if gcd(a, b) == 1: | |
hashparams.append((a, b)) | |
self.hashparams = hashparams | |
def add(self, x): | |
''' Add a single item to the filter ''' | |
primemod = self.primemod | |
filtermod = self.filtermod | |
filterarray = self.filterarray | |
for a, b in self.hashparams: | |
y = ((a*x + b) % primemod) % filtermod | |
filterarray[y >> 3] |= 1 << (y & 7) | |
@timed | |
def update(self, data): | |
''' Add data collection to the filter ''' | |
hashparams = self.hashparams | |
primemod = self.primemod | |
filtermod = self.filtermod | |
filterarray = self.filterarray | |
for x in data: | |
for a, b in hashparams: | |
y = ((a*x + b) % primemod) % filtermod | |
filterarray[y >> 3] |= 1 << (y & 7) | |
##About 5% slower | |
#@timed | |
#def updateNEW(self, data): | |
#''' Add data collection to the filter ''' | |
#for x in data: | |
#self.add(x) | |
def bitcount(self, popcount=make_popcount()): | |
count = sum(popcount[v] for v in self.filterarray) | |
rate = count / self.filtermod | |
print('Used bits: {}, rate: {:.6f}'.format(count, rate)) | |
def __contains__(self, x): | |
''' Test if x is in the filter. This may return false | |
positives but it does not return false negatives | |
''' | |
primemod = self.primemod | |
filtermod = self.filtermod | |
filterarray = self.filterarray | |
for a, b in self.hashparams: | |
y = ((a*x + b) % primemod) % filtermod | |
if not (filterarray[y >> 3] >> (y & 7)) & 1: | |
return False | |
return True | |
@timed | |
def make_data(self): | |
''' Make a full set of random data ''' | |
primemod = self.primemod | |
return {randrange(primemod) for _ in range(self.num)} | |
def test(self, testnum): | |
print('Making data...') | |
data = self.make_data() | |
print('Adding data to filter...') | |
self.update(data) | |
self.bitcount() | |
print('Testing all the data...') | |
self.test_data(data) | |
self.test_random(data, testnum) | |
@timed | |
def test_data(self, data): | |
bad = 0 | |
for x in data: | |
if x not in self: | |
print('Missed {}!'.format(x)) | |
bad += 1 | |
if bad: | |
print(bad, 'items not found') | |
else: | |
print('All items found') | |
@timed | |
def test_random(self, data, testnum): | |
''' Measure false positive ratio by testing with random | |
numbers that aren't in the filter's data set | |
''' | |
print(testnum, 'random tests...') | |
primemod = self.primemod | |
i = count = 0 | |
while i < testnum: | |
x = randrange(primemod) | |
if x not in data: | |
i += 1 | |
if x in self: | |
count += 1 | |
p = count / i | |
print(' {} / {} = {:.3}'.format(count, i, p), | |
end='\r', flush=True) | |
print('\nFalse positives:', count, count / testnum) | |
def main(): | |
# Number of items that will be added to the filter | |
num = 100000 | |
# False positives probability | |
fpp = 1E-5 | |
seed(42) | |
print('Number of items:', num) | |
print('Desired false positive probability:', fpp) | |
bloom = BloomFilter(num, fpp) | |
bloom.test(10 * num) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment