Created
September 5, 2017 15:41
-
-
Save israelshirk/fc8e0850e83dd25c10a804ac27f31aaa to your computer and use it in GitHub Desktop.
Modified version of jaybaird/python-bloomfilter to use mmh3 instead of more expensive cryptographic one-way functions.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
"""This module implements a bloom filter probabilistic data structure and | |
an a Scalable Bloom Filter that grows in size as your add more items to it | |
without increasing the false positive error_rate. | |
Requires the bitarray library: http://pypi.python.org/pypi/bitarray/ | |
>>> from pybloom import BloomFilter | |
>>> f = BloomFilter(capacity=10000, error_rate=0.001) | |
>>> for i in range_fn(0, f.capacity): | |
... _ = f.add(i) | |
... | |
>>> 0 in f | |
True | |
>>> f.capacity in f | |
False | |
>>> len(f) <= f.capacity | |
True | |
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 | |
True | |
>>> from pybloom import ScalableBloomFilter | |
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) | |
>>> count = 10000 | |
>>> for i in range_fn(0, count): | |
... _ = sbf.add(i) | |
... | |
>>> sbf.capacity > count | |
True | |
>>> len(sbf) <= count | |
True | |
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 | |
True | |
""" | |
from __future__ import absolute_import | |
import math | |
from struct import unpack, pack, calcsize | |
import mmh3 | |
import StringIO | |
import cStringIO | |
def range_fn(*args): | |
return xrange(*args) | |
def is_string_io(instance): | |
return isinstance(instance, (StringIO.StringIO, | |
cStringIO.InputType, | |
cStringIO.OutputType)) | |
try: | |
import bitarray | |
except ImportError: | |
raise ImportError('pybloom requires bitarray >= 0.3.4') | |
__version__ = '2.0' | |
__author__ = "Jay Baird <[email protected]>, Bob Ippolito <[email protected]>,\ | |
Marius Eriksen <[email protected]>,\ | |
Alex Brasetvik <[email protected]>,\ | |
Matt Bachmann <[email protected]>,\ | |
Israel Shirk <[email protected]> \ | |
" | |
class BloomFilter(object): | |
FILE_FMT = b'<dQQQQ' | |
def __init__(self, capacity, error_rate=0.001): | |
"""Implements a space-efficient probabilistic data structure | |
capacity | |
this BloomFilter must be able to store at least *capacity* elements | |
while maintaining no more than *error_rate* chance of false | |
positives | |
error_rate | |
the error_rate of the filter returning false positives. This | |
determines the filters capacity. Inserting more than capacity | |
elements greatly increases the chance of false positives. | |
>>> b = BloomFilter(capacity=100000, error_rate=0.001) | |
>>> b.add("test") | |
False | |
>>> "test" in b | |
True | |
""" | |
if not (0 < error_rate < 1): | |
raise ValueError("Error_Rate must be between 0 and 1.") | |
if not capacity > 0: | |
raise ValueError("Capacity must be > 0") | |
# given M = num_bits, k = num_slices, P = error_rate, n = capacity | |
# k = log2(1/P) | |
# solving for m = bits_per_slice | |
# n ~= M * ((ln(2) ** 2) / abs(ln(P))) | |
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) | |
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) | |
num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) | |
bits_per_slice = int(math.ceil( | |
(capacity * abs(math.log(error_rate))) / | |
(num_slices * (math.log(2) ** 2)))) | |
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) | |
self.bitarray = bitarray.bitarray(self.num_bits, endian='little') | |
self.bitarray.setall(False) | |
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): | |
self.error_rate = error_rate | |
self.num_slices = num_slices | |
self.bits_per_slice = bits_per_slice | |
self.capacity = capacity | |
self.num_bits = num_slices * bits_per_slice | |
self.count = count | |
self.make_hashes = mmh3.hash | |
def __contains__(self, key): | |
"""Tests a key's membership in this bloom filter. | |
>>> b = BloomFilter(capacity=100) | |
>>> b.add("hello") | |
False | |
>>> "hello" in b | |
True | |
""" | |
bits_per_slice = self.bits_per_slice | |
bitarray = self.bitarray | |
hashes = self.make_hashes(key) | |
offset = 0 | |
for k in hashes: | |
if not bitarray[offset + k]: | |
return False | |
offset += bits_per_slice | |
return True | |
def __len__(self): | |
"""Return the number of keys stored by this bloom filter.""" | |
return self.count | |
def add(self, key, skip_check=False): | |
""" Adds a key to this bloom filter. If the key already exists in this | |
filter it will return True. Otherwise False. | |
>>> b = BloomFilter(capacity=100) | |
>>> b.add("hello") | |
False | |
>>> b.add("hello") | |
True | |
>>> b.count | |
1 | |
""" | |
bitarray = self.bitarray | |
bits_per_slice = self.bits_per_slice | |
hashes = self.make_hashes(key) | |
found_all_bits = True | |
if self.count > self.capacity: | |
raise IndexError("BloomFilter is at capacity") | |
offset = 0 | |
for k in hashes: | |
if not skip_check and found_all_bits and not bitarray[offset + k]: | |
found_all_bits = False | |
self.bitarray[offset + k] = True | |
offset += bits_per_slice | |
if skip_check: | |
self.count += 1 | |
return False | |
elif not found_all_bits: | |
self.count += 1 | |
return False | |
else: | |
return True | |
def copy(self): | |
"""Return a copy of this bloom filter. | |
""" | |
new_filter = BloomFilter(self.capacity, self.error_rate) | |
new_filter.bitarray = self.bitarray.copy() | |
return new_filter | |
def union(self, other): | |
""" Calculates the union of the two underlying bitarrays and returns | |
a new bloom filter object.""" | |
if self.capacity != other.capacity or \ | |
self.error_rate != other.error_rate: | |
raise ValueError("Unioning filters requires both filters to have \ | |
both the same capacity and error rate") | |
new_bloom = self.copy() | |
new_bloom.bitarray = new_bloom.bitarray | other.bitarray | |
return new_bloom | |
def __or__(self, other): | |
return self.union(other) | |
def intersection(self, other): | |
""" Calculates the intersection of the two underlying bitarrays and returns | |
a new bloom filter object.""" | |
if self.capacity != other.capacity or \ | |
self.error_rate != other.error_rate: | |
raise ValueError("Intersecting filters requires both filters to \ | |
have equal capacity and error rate") | |
new_bloom = self.copy() | |
new_bloom.bitarray = new_bloom.bitarray & other.bitarray | |
return new_bloom | |
def __and__(self, other): | |
return self.intersection(other) | |
def tofile(self, f): | |
"""Write the bloom filter to file object `f'. Underlying bits | |
are written as machine values. This is much more space | |
efficient than pickling the object.""" | |
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, | |
self.bits_per_slice, self.capacity, self.count)) | |
(f.write(self.bitarray.tobytes()) if is_string_io(f) | |
else self.bitarray.tofile(f)) | |
@classmethod | |
def fromfile(cls, f, n=-1): | |
"""Read a bloom filter from file-object `f' serialized with | |
``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" | |
headerlen = calcsize(cls.FILE_FMT) | |
if 0 < n < headerlen: | |
raise ValueError('n too small!') | |
filter = cls(1) # Bogus instantiation, we will `_setup'. | |
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) | |
filter.bitarray = bitarray.bitarray(endian='little') | |
if n > 0: | |
(filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f) | |
else filter.bitarray.fromfile(f, n - headerlen)) | |
else: | |
(filter.bitarray.frombytes(f.read()) if is_string_io(f) | |
else filter.bitarray.fromfile(f)) | |
if filter.num_bits != filter.bitarray.length() and \ | |
(filter.num_bits + (8 - filter.num_bits % 8) | |
!= filter.bitarray.length()): | |
raise ValueError('Bit length mismatch!') | |
return filter | |
def __getstate__(self): | |
d = self.__dict__.copy() | |
del d['make_hashes'] | |
return d | |
def __setstate__(self, d): | |
self.__dict__.update(d) | |
self.make_hashes = mmh3.hash |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Copyright (c) <2011> <Jay Baird and Bob Ippolito> | |
Permission is hereby granted, free of charge, to any person | |
obtaining a copy of this software and associated documentation | |
files (the "Software"), to deal in the Software without | |
restriction, including without limitation the rights to use, | |
copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the | |
Software is furnished to do so, subject to the following | |
conditions: | |
The above copyright notice and this permission notice shall be | |
included in all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
OTHER DEALINGS IN THE SOFTWARE. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment