Created
October 1, 2013 01:53
-
-
Save mynameisfiber/6772906 to your computer and use it in GitHub Desktop.
Counting Bloom Filter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import array | |
| import struct | |
| import math | |
| import mmh3 | |
| class CountingBloomFilter(object): | |
| def __init__(self, capacity, error=0.005, dtype="B"): | |
| self.capacity = capacity | |
| self.error = error | |
| self.num_bits = int(-capacity * math.log(error) / math.log(2)**2) + 1 | |
| self.num_hashes = int(self.num_bits / capacity * math.log(2)) + 1 | |
| self.dtype = dtype | |
| self.data = array.array(dtype, (0,) * self.num_bits) | |
| def _indexes(self, key): | |
| """ | |
| Generates the indicies corresponding to the given key | |
| """ | |
| for i in xrange(self.num_hashes): | |
| h1, h2 = mmh3.hash64(key) | |
| yield (h1 + i * h2) % self.num_bits | |
| def add(self, key, N=1): | |
| """ | |
| Adds `N` counts to the indicies given by the key | |
| """ | |
| assert isinstance(key, str) | |
| for index in self._indexes(key): | |
| self.data[index] += N | |
| return self | |
| def remove(self, key, N=1): | |
| """ | |
| Removes `N` counts to the indicies given by the key | |
| """ | |
| assert isinstance(key, str) | |
| indexes = list(self._indexes(key)) | |
| if not any(self.data[index] < N for index in indexes): | |
| for index in indexes: | |
| self.data[index] -= N | |
| return self | |
| def remove_all(self, N=1): | |
| """ | |
| Removes `N` counts to all indicies. Useful for expirations | |
| """ | |
| for i in xrange(self.num_bits): | |
| if self.data[i] >= N: | |
| self.data[i] -= N | |
| def contains(self, key): | |
| """ | |
| Check if the current bloom contains the key `key` | |
| """ | |
| assert isinstance(key, str) | |
| return all(self.data[index] != 0 for index in self._indexes(key)) | |
| def tofile(self, f): | |
| """ | |
| Writes the bloom into the given fileobject. | |
| """ | |
| header = struct.pack("QdQQc", self.capacity, self.error, self.num_bits, self.num_hashes, self.dtype) | |
| f.write(header + "\n") | |
| self.data.tofile(f) | |
| @classmethod | |
| def fromfile(cls, f): | |
| """ | |
| Reads the bloom from the given fileobject and returns the python object | |
| """ | |
| self = cls.__new__(cls) | |
| header = f.readline()[:-1] | |
| self.capacity, self.error, self.num_bits, self.num_hashes, self.dtype = struct.unpack("QdQQc", header) | |
| self.data = array.array(self.dtype) | |
| self.data.fromfile(f, self.num_bits) | |
| return self | |
| def __contains__(self, key): | |
| return self.contains(key) | |
| def __add__(self, other): | |
| return self.add(other, 1) | |
| def __sub__(self, other): | |
| return self.remove(other, 1) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment