israelshirk · September 5, 2017 15:41
diff --git a/bloom.py b/bloom.py
 # -*- encoding: utf-8 -*-
 """This module implements a bloom filter probabilistic data structure and
 an a Scalable Bloom Filter that grows in size as your add more items to it
 without increasing the false positive error_rate.

 Requires the bitarray library: http://pypi.python.org/pypi/bitarray/

    >>> from pybloom import BloomFilter
    >>> f = BloomFilter(capacity=10000, error_rate=0.001)
    >>> for i in range_fn(0, f.capacity):
    ...     _ = f.add(i)
    ...
    >>> 0 in f
    True
    >>> f.capacity in f
    False
    >>> len(f) <= f.capacity
    True
    >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
    True

    >>> from pybloom import ScalableBloomFilter
    >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> count = 10000
    >>> for i in range_fn(0, count):
    ...     _ = sbf.add(i)
    ...
    >>> sbf.capacity > count
    True
    >>> len(sbf) <= count
    True
    >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
    True

 """
 from __future__ import absolute_import
 import math
 from struct import unpack, pack, calcsize
 import mmh3

 import StringIO
 import cStringIO


 def range_fn(*args):
    return xrange(*args)


 def is_string_io(instance):
    return isinstance(instance, (StringIO.StringIO,
                                 cStringIO.InputType,
                                 cStringIO.OutputType))

 try:
    import bitarray
 except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')

 __version__ = '2.0'
 __author__  = "Jay Baird <[email protected]>, Bob Ippolito <[email protected]>,\
               Marius Eriksen <[email protected]>,\
               Alex Brasetvik <[email protected]>,\
               Matt Bachmann <[email protected]>,\
               Israel Shirk <[email protected]> \
              "


 class BloomFilter(object):
    FILE_FMT = b'<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        """Implements a space-efficient probabilistic data structure

        capacity
            this BloomFilter must be able to store at least *capacity* elements
            while maintaining no more than *error_rate* chance of false
            positives
        error_rate
            the error_rate of the filter returning false positives. This
            determines the filters capacity. Inserting more than capacity
            elements greatly increases the chance of false positives.

        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
        >>> b.add("test")
        False
        >>> "test" in b
        True

        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
        #       k = log2(1/P)
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = mmh3.hash

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = BloomFilter(capacity=100)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        hashes = self.make_hashes(key)
        offset = 0
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """ Adds a key to this bloom filter. If the key already exists in this
        filter it will return True. Otherwise False.

        >>> b = BloomFilter(capacity=100)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True
        >>> b.count
        1

        """
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        found_all_bits = True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            if not skip_check and found_all_bits and not bitarray[offset + k]:
                found_all_bits = False
            self.bitarray[offset + k] = True
            offset += bits_per_slice

        if skip_check:
            self.count += 1
            return False
        elif not found_all_bits:
            self.count += 1
            return False
        else:
            return True

    def copy(self):
        """Return a copy of this bloom filter.
        """
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """ Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Unioning filters requires both filters to have \
 both the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """ Calculates the intersection of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Intersecting filters requires both filters to \
 have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        (f.write(self.bitarray.tobytes()) if is_string_io(f)
         else self.bitarray.tofile(f))

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            (filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f)
             else filter.bitarray.fromfile(f, n - headerlen))
        else:
            (filter.bitarray.frombytes(f.read()) if is_string_io(f)
             else filter.bitarray.fromfile(f))
        if filter.num_bits != filter.bitarray.length() and \
            (filter.num_bits + (8 - filter.num_bits % 8)
                != filter.bitarray.length()):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = mmh3.hash
diff --git a/LICENSE.txt b/LICENSE.txt
 Copyright (c) <2011> <Jay Baird and Bob Ippolito>

 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:

 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
	# -- encoding: utf-8 --
	"""This module implements a bloom filter probabilistic data structure and
	an a Scalable Bloom Filter that grows in size as your add more items to it
	without increasing the false positive error_rate.

	Requires the bitarray library: http://pypi.python.org/pypi/bitarray/

	>>> from pybloom import BloomFilter
	>>> f = BloomFilter(capacity=10000, error_rate=0.001)
	>>> for i in range_fn(0, f.capacity):
	... _ = f.add(i)
	...
	>>> 0 in f
	True
	>>> f.capacity in f
	False
	>>> len(f) <= f.capacity
	True
	>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
	True

	>>> from pybloom import ScalableBloomFilter
	>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
	>>> count = 10000
	>>> for i in range_fn(0, count):
	... _ = sbf.add(i)
	...
	>>> sbf.capacity > count
	True
	>>> len(sbf) <= count
	True
	>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
	True

	"""
	from __future__ import absolute_import
	import math
	from struct import unpack, pack, calcsize
	import mmh3

	import StringIO
	import cStringIO


	def range_fn(*args):
	return xrange(*args)


	def is_string_io(instance):
	return isinstance(instance, (StringIO.StringIO,
	cStringIO.InputType,
	cStringIO.OutputType))

	try:
	import bitarray
	except ImportError:
	raise ImportError('pybloom requires bitarray >= 0.3.4')

	__version__ = '2.0'
	__author__ = "Jay Baird <[email protected]>, Bob Ippolito <[email protected]>,\
	Marius Eriksen <[email protected]>,\
	Alex Brasetvik <[email protected]>,\
	Matt Bachmann <[email protected]>,\
	Israel Shirk <[email protected]> \
	"


	class BloomFilter(object):
	FILE_FMT = b'<dQQQQ'

	def __init__(self, capacity, error_rate=0.001):
	"""Implements a space-efficient probabilistic data structure

	capacity
	this BloomFilter must be able to store at least capacity elements
	while maintaining no more than error_rate chance of false
	positives
	error_rate
	the error_rate of the filter returning false positives. This
	determines the filters capacity. Inserting more than capacity
	elements greatly increases the chance of false positives.

	>>> b = BloomFilter(capacity=100000, error_rate=0.001)
	>>> b.add("test")
	False
	>>> "test" in b
	True

	"""
	if not (0 < error_rate < 1):
	raise ValueError("Error_Rate must be between 0 and 1.")
	if not capacity > 0:
	raise ValueError("Capacity must be > 0")
	# given M = num_bits, k = num_slices, P = error_rate, n = capacity
	# k = log2(1/P)
	# solving for m = bits_per_slice
	# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
	# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
	# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
	num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
	bits_per_slice = int(math.ceil(
	(capacity * abs(math.log(error_rate))) /
	(num_slices * (math.log(2) ** 2))))
	self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
	self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
	self.bitarray.setall(False)

	def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
	self.error_rate = error_rate
	self.num_slices = num_slices
	self.bits_per_slice = bits_per_slice
	self.capacity = capacity
	self.num_bits = num_slices * bits_per_slice
	self.count = count
	self.make_hashes = mmh3.hash

	def __contains__(self, key):
	"""Tests a key's membership in this bloom filter.

	>>> b = BloomFilter(capacity=100)
	>>> b.add("hello")
	False
	>>> "hello" in b
	True

	"""
	bits_per_slice = self.bits_per_slice
	bitarray = self.bitarray
	hashes = self.make_hashes(key)
	offset = 0
	for k in hashes:
	if not bitarray[offset + k]:
	return False
	offset += bits_per_slice
	return True

	def __len__(self):
	"""Return the number of keys stored by this bloom filter."""
	return self.count

	def add(self, key, skip_check=False):
	""" Adds a key to this bloom filter. If the key already exists in this
	filter it will return True. Otherwise False.

	>>> b = BloomFilter(capacity=100)
	>>> b.add("hello")
	False
	>>> b.add("hello")
	True
	>>> b.count
	1

	"""
	bitarray = self.bitarray
	bits_per_slice = self.bits_per_slice
	hashes = self.make_hashes(key)
	found_all_bits = True
	if self.count > self.capacity:
	raise IndexError("BloomFilter is at capacity")
	offset = 0
	for k in hashes:
	if not skip_check and found_all_bits and not bitarray[offset + k]:
	found_all_bits = False
	self.bitarray[offset + k] = True
	offset += bits_per_slice

	if skip_check:
	self.count += 1
	return False
	elif not found_all_bits:
	self.count += 1
	return False
	else:
	return True

	def copy(self):
	"""Return a copy of this bloom filter.
	"""
	new_filter = BloomFilter(self.capacity, self.error_rate)
	new_filter.bitarray = self.bitarray.copy()
	return new_filter

	def union(self, other):
	""" Calculates the union of the two underlying bitarrays and returns
	a new bloom filter object."""
	if self.capacity != other.capacity or \
	self.error_rate != other.error_rate:
	raise ValueError("Unioning filters requires both filters to have \
	both the same capacity and error rate")
	new_bloom = self.copy()
	new_bloom.bitarray = new_bloom.bitarray \| other.bitarray
	return new_bloom

	def __or__(self, other):
	return self.union(other)

	def intersection(self, other):
	""" Calculates the intersection of the two underlying bitarrays and returns
	a new bloom filter object."""
	if self.capacity != other.capacity or \
	self.error_rate != other.error_rate:
	raise ValueError("Intersecting filters requires both filters to \
	have equal capacity and error rate")
	new_bloom = self.copy()
	new_bloom.bitarray = new_bloom.bitarray & other.bitarray
	return new_bloom

	def __and__(self, other):
	return self.intersection(other)

	def tofile(self, f):
	"""Write the bloom filter to file object `f'. Underlying bits
	are written as machine values. This is much more space
	efficient than pickling the object."""
	f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
	self.bits_per_slice, self.capacity, self.count))
	(f.write(self.bitarray.tobytes()) if is_string_io(f)
	else self.bitarray.tofile(f))

	@classmethod
	def fromfile(cls, f, n=-1):
	"""Read a bloom filter from file-object `f' serialized with
	``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
	headerlen = calcsize(cls.FILE_FMT)

	if 0 < n < headerlen:
	raise ValueError('n too small!')

	filter = cls(1) # Bogus instantiation, we will `_setup'.
	filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
	filter.bitarray = bitarray.bitarray(endian='little')
	if n > 0:
	(filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f)
	else filter.bitarray.fromfile(f, n - headerlen))
	else:
	(filter.bitarray.frombytes(f.read()) if is_string_io(f)
	else filter.bitarray.fromfile(f))
	if filter.num_bits != filter.bitarray.length() and \
	(filter.num_bits + (8 - filter.num_bits % 8)
	!= filter.bitarray.length()):
	raise ValueError('Bit length mismatch!')

	return filter

	def __getstate__(self):
	d = self.__dict__.copy()
	del d['make_hashes']
	return d

	def __setstate__(self, d):
	self.__dict__.update(d)
	self.make_hashes = mmh3.hash
	Copyright (c) <2011> <Jay Baird and Bob Ippolito>

	Permission is hereby granted, free of charge, to any person
	obtaining a copy of this software and associated documentation
	files (the "Software"), to deal in the Software without
	restriction, including without limitation the rights to use,
	copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the
	Software is furnished to do so, subject to the following
	conditions:

	The above copyright notice and this permission notice shall be
	included in all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
	OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
	HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	OTHER DEALINGS IN THE SOFTWARE.