Last active
August 29, 2015 13:59
-
-
Save msukmanowsky/10985299 to your computer and use it in GitHub Desktop.
An implementation of a Bloom Filter using Cython, still has a memory leak to debug.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ported from https://github.com/jvirkki/libbloom | |
from cpython cimport bool | |
from libc.stdlib cimport malloc, calloc, free | |
from libc.string cimport memset | |
from libc.stdio cimport printf | |
from libc.math cimport log, ceil | |
from cpython.mem cimport PyMem_Malloc, PyMem_Free | |
DEF LN2_SQUARED = 0.480453013918201 # ln(2)^2 | |
DEF LN2 = 0.693147180559945 # ln(2) | |
cdef extern from "murmur2/murmurhash2.h": | |
unsigned int murmurhash2(const void * key, int len, const unsigned int seed) | |
cdef: | |
void * PyMem_Calloc(size_t num, size_t size): | |
cdef void *p | |
cdef size_t total = num * size | |
p = PyMem_Malloc(total) | |
if p != NULL: | |
memset(p, 0, total) | |
return p | |
struct __bloomfilter: | |
int capacity | |
double error | |
int bits | |
int bytes | |
int hashes | |
double bits_per_element | |
unsigned char *bf | |
class BloomFilter: | |
cdef __bloomfilter *__bf | |
def __init__(self, capacity, error): | |
pass # no-op for signature | |
def __cinit__(self, int capacity, double error): | |
printf("Initiated with capacity=%d and error %f\n", capacity, error) | |
self.__bf = <__bloomfilter *>PyMem_Malloc(sizeof(__bloomfilter)) | |
if not self.__bf: | |
raise MemoryError("Unable to allocate memory for BloomFilter") | |
if capacity < 1: | |
raise ValueError("BloomFilter must hold at least one element") | |
if error <= 0.0 or error >= 1.0: | |
raise ValueError(("BloomFilter needs to specify positive error " | |
"between (0.0, 1.0)")) | |
self.__bf.capacity = capacity | |
self.__bf.error = error | |
cdef double num = log(self.__bf.error) | |
self.__bf.bits_per_element = -(num / LN2_SQUARED) | |
cdef double dcapacity = <double>capacity | |
self.__bf.bits = <int>(dcapacity * self.__bf.bits_per_element) | |
if self.__bf.bits % 8: | |
self.__bf.bytes = (self.__bf.bits / 8) + 1 | |
else: | |
self.__bf.bytes = self.__bf.bits / 8 | |
self.__bf.hashes = <int>ceil(LN2 * self.__bf.bits_per_element) # ln(2) | |
self.__bf.bf = <unsigned char *>PyMem_Calloc(self.__bf.bytes, sizeof(unsigned char)) | |
if not self.__bf.bf: | |
raise MemoryError("Unable to allocate memory for bloom filter") | |
def __dealloc__(self): | |
printf("dealloc called\n") | |
if self.__bf != NULL and self.__bf.bf != NULL: | |
# Still need to debug this, if we instatiate a BloomFilter but | |
# it raises TypeError, __dealloc__ is called, but we free memory | |
# that was somehow allocated (not NULL), but didn't belong to | |
# us? | |
PyMem_Free(self.__bf.bf) | |
PyMem_Free(self.__bf) # no-op if self.__bf is NULL | |
cdef int __check_or_add(self, const char *value, int should_add=1): | |
cdef int hits = 0 | |
cdef unsigned int val_len = len(value) | |
cdef unsigned int a = murmurhash2(value, val_len, 0x9747b28c) | |
cdef unsigned int b = murmurhash2(value, val_len, a) | |
cdef unsigned int x | |
cdef unsigned int i | |
cdef unsigned int byte | |
cdef unsigned int mask | |
cdef unsigned char c | |
for i in range(self.__bf.hashes): | |
x = (a + i * b) % self.__bf.bits | |
byte = x >> 3 | |
c = self.__bf.bf[byte] | |
mask = 1 << (x % 8) | |
if c & mask: | |
hits += 1 | |
else: | |
if should_add == 1: | |
self.__bf.bf[byte] = c | mask | |
if hits == self.__bf.hashes: | |
return 1 # 1 == element already in (or collision) | |
return 0 | |
def add(self, const char *value): | |
return self.__check_or_add(value, True) == 1 | |
def __contains__(self, const char *value): | |
return self.__check_or_add(value) == 1 | |
def __repr__(self): | |
return u'BloomFilter(capacity={}, error={})'.format(self.__bf.capacity, self.__bf.error) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from uuid import uuid4 | |
from cbloomfilter import BloomFilter | |
class TestCBloomFilter(unittest.TestCase): | |
def test_basic(self): | |
"""Simple test cases.""" | |
self.assertRaises(TypeError, BloomFilter, 'mike', 'mike') | |
self.assertRaises(TypeError, BloomFilter, -1, 0) | |
self.assertRaises(TypeError, BloomFilter, 10, 0) | |
bf = BloomFilter(100, 0.025) | |
self.assertEquals(bf.add('mike'), False) | |
self.assertEquals(bf.add('mike'), True) | |
self.assertIn('mike', bf) | |
def test_random(self): | |
bf = BloomFilter(100, 0.025) | |
collisions = 0 | |
for i in range(1000): | |
key = str(uuid4()) | |
if bf.add(key): | |
collisions += 1 | |
print 'Added {:,} elements and got {:,} collisions'.format(1000, | |
collisions) | |
if __name__ == '__main__': | |
unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment