Skip to content

Instantly share code, notes, and snippets.

@israelshirk
Created September 5, 2017 15:41
Show Gist options
  • Save israelshirk/fc8e0850e83dd25c10a804ac27f31aaa to your computer and use it in GitHub Desktop.
Save israelshirk/fc8e0850e83dd25c10a804ac27f31aaa to your computer and use it in GitHub Desktop.
Modified version of jaybaird/python-bloomfilter to use mmh3 instead of more expensive cryptographic one-way functions.
# -*- encoding: utf-8 -*-
"""This module implements a bloom filter probabilistic data structure and
an a Scalable Bloom Filter that grows in size as your add more items to it
without increasing the false positive error_rate.
Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
>>> from pybloom import BloomFilter
>>> f = BloomFilter(capacity=10000, error_rate=0.001)
>>> for i in range_fn(0, f.capacity):
... _ = f.add(i)
...
>>> 0 in f
True
>>> f.capacity in f
False
>>> len(f) <= f.capacity
True
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True
>>> from pybloom import ScalableBloomFilter
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
>>> for i in range_fn(0, count):
... _ = sbf.add(i)
...
>>> sbf.capacity > count
True
>>> len(sbf) <= count
True
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
"""
from __future__ import absolute_import
import math
from struct import unpack, pack, calcsize
import mmh3
import StringIO
import cStringIO
def range_fn(*args):
return xrange(*args)
def is_string_io(instance):
return isinstance(instance, (StringIO.StringIO,
cStringIO.InputType,
cStringIO.OutputType))
try:
import bitarray
except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4')
__version__ = '2.0'
__author__ = "Jay Baird <[email protected]>, Bob Ippolito <[email protected]>,\
Marius Eriksen <[email protected]>,\
Alex Brasetvik <[email protected]>,\
Matt Bachmann <[email protected]>,\
Israel Shirk <[email protected]> \
"
class BloomFilter(object):
FILE_FMT = b'<dQQQQ'
def __init__(self, capacity, error_rate=0.001):
"""Implements a space-efficient probabilistic data structure
capacity
this BloomFilter must be able to store at least *capacity* elements
while maintaining no more than *error_rate* chance of false
positives
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Inserting more than capacity
elements greatly increases the chance of false positives.
>>> b = BloomFilter(capacity=100000, error_rate=0.001)
>>> b.add("test")
False
>>> "test" in b
True
"""
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
# given M = num_bits, k = num_slices, P = error_rate, n = capacity
# k = log2(1/P)
# solving for m = bits_per_slice
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
bits_per_slice = int(math.ceil(
(capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2))))
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
self.bitarray.setall(False)
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
self.error_rate = error_rate
self.num_slices = num_slices
self.bits_per_slice = bits_per_slice
self.capacity = capacity
self.num_bits = num_slices * bits_per_slice
self.count = count
self.make_hashes = mmh3.hash
def __contains__(self, key):
"""Tests a key's membership in this bloom filter.
>>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> "hello" in b
True
"""
bits_per_slice = self.bits_per_slice
bitarray = self.bitarray
hashes = self.make_hashes(key)
offset = 0
for k in hashes:
if not bitarray[offset + k]:
return False
offset += bits_per_slice
return True
def __len__(self):
"""Return the number of keys stored by this bloom filter."""
return self.count
def add(self, key, skip_check=False):
""" Adds a key to this bloom filter. If the key already exists in this
filter it will return True. Otherwise False.
>>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> b.add("hello")
True
>>> b.count
1
"""
bitarray = self.bitarray
bits_per_slice = self.bits_per_slice
hashes = self.make_hashes(key)
found_all_bits = True
if self.count > self.capacity:
raise IndexError("BloomFilter is at capacity")
offset = 0
for k in hashes:
if not skip_check and found_all_bits and not bitarray[offset + k]:
found_all_bits = False
self.bitarray[offset + k] = True
offset += bits_per_slice
if skip_check:
self.count += 1
return False
elif not found_all_bits:
self.count += 1
return False
else:
return True
def copy(self):
"""Return a copy of this bloom filter.
"""
new_filter = BloomFilter(self.capacity, self.error_rate)
new_filter.bitarray = self.bitarray.copy()
return new_filter
def union(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
return new_bloom
def __or__(self, other):
return self.union(other)
def intersection(self, other):
""" Calculates the intersection of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
return new_bloom
def __and__(self, other):
return self.intersection(other)
def tofile(self, f):
"""Write the bloom filter to file object `f'. Underlying bits
are written as machine values. This is much more space
efficient than pickling the object."""
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
self.bits_per_slice, self.capacity, self.count))
(f.write(self.bitarray.tobytes()) if is_string_io(f)
else self.bitarray.tofile(f))
@classmethod
def fromfile(cls, f, n=-1):
"""Read a bloom filter from file-object `f' serialized with
``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
headerlen = calcsize(cls.FILE_FMT)
if 0 < n < headerlen:
raise ValueError('n too small!')
filter = cls(1) # Bogus instantiation, we will `_setup'.
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
filter.bitarray = bitarray.bitarray(endian='little')
if n > 0:
(filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f)
else filter.bitarray.fromfile(f, n - headerlen))
else:
(filter.bitarray.frombytes(f.read()) if is_string_io(f)
else filter.bitarray.fromfile(f))
if filter.num_bits != filter.bitarray.length() and \
(filter.num_bits + (8 - filter.num_bits % 8)
!= filter.bitarray.length()):
raise ValueError('Bit length mismatch!')
return filter
def __getstate__(self):
d = self.__dict__.copy()
del d['make_hashes']
return d
def __setstate__(self, d):
self.__dict__.update(d)
self.make_hashes = mmh3.hash
Copyright (c) <2011> <Jay Baird and Bob Ippolito>
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment