Created
August 3, 2017 20:10
-
-
Save hiway/2b7f5c795e9fd28ce33ffa5ae90fbfa8 to your computer and use it in GitHub Desktop.
A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
try: | |
import urandom as _random | |
import uhashlib as _hashlib | |
import ubinascii as _binascii | |
except ImportError: | |
import random as _random | |
import hashlib as _hashlib | |
import binascii as _binascii | |
class BloomFilter: | |
""" | |
A lightweight bloom filter for Micropython and Python 3 with a tiny memory footprint. | |
Reference: | |
- http://code.activestate.com/recipes/577684-bloom-filter/ | |
- http://maciejczyzewski.me/2014/10/18/bloom-filters-fast-and-simple.html | |
""" | |
def __init__(self, size: int = 128, probes: int = 3, iterable: tuple = ()): | |
self.array = bytearray(size) | |
self._bit_count = size * 8 # each byte in the bytearray = 8 bits | |
self._probes = probes | |
self.update(iterable) | |
def __contains__(self, key): | |
return all(self.array[i // 8] & (2 ** (i % 8)) for i in self._get_probes(key)) | |
def _get_probes(self, key: str): | |
digest = int(_binascii.hexlify(_hashlib.sha256(key.encode()).digest()), 16) | |
for _ in range(self._probes): | |
# yield h & ((2 ** 13) - 1) | |
# yield h & 8191 | |
yield digest & self._bit_count - 1 | |
digest >>= 256 // self._probes | |
def add(self, key: str): | |
for i in self._get_probes(key): | |
self.array[i // 8] |= 2 ** (i % 8) | |
def update(self, keys): | |
for key in keys: | |
self.add(key) | |
if __name__ == '__main__': | |
positives = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut | |
Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas | |
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota | |
Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey | |
NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon | |
Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah | |
Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split() | |
negatives = """lkjdhf lkdjh djk gjhd fhv jdhbfjdbflkj ad flakjdf kldajf kdjfn dkjn | |
fdkj fkdjb fdjbf jabd jkh hja dvfjhabd jsbdjhas bdjhab sdjbasdjahbs | |
dhjasv dhv dks78t3 4uy3g beg f8yg u3hb433fi 7ku1bi76f3r vgqdhbbfi76erf | |
kjh 37wo2jo3kdkg i3yig iu 3geakjge3h kjdh uy eiu hejnkhfd hdofjdkjfld""".split() | |
bf = BloomFilter(iterable=positives) | |
m = sum(p in bf for p in positives) | |
print('%d true positives and %d false negatives out of %d positive trials' | |
% (m, len(positives) - m, len(positives))) | |
m = sum(b not in bf for b in negatives) | |
print('%d true negatives and %d false positives out of %d negative trials' | |
% (m, len(negatives) - m, len(negatives))) | |
try: | |
from random import sample | |
from string import ascii_letters | |
trials = 100000 | |
m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials)) | |
print('%d true negatives and %d false positives out of %d negative trials' | |
% (trials - m, m, trials)) | |
print(100 - (((trials - m) / trials) * 100)) | |
except ImportError: # fails on micropython | |
pass | |
c = ''.join('08b'.format(x) for x in bf.array) | |
print('Bit density:', c.count('1') / float(len(c))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment