Skip to content

Instantly share code, notes, and snippets.

@msukmanowsky
Last active August 29, 2015 13:59
Show Gist options
  • Save msukmanowsky/10985299 to your computer and use it in GitHub Desktop.
Save msukmanowsky/10985299 to your computer and use it in GitHub Desktop.
An implementation of a Bloom Filter using Cython, still has a memory leak to debug.
# ported from https://github.com/jvirkki/libbloom
from cpython cimport bool
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.stdio cimport printf
from libc.math cimport log, ceil
from cpython.mem cimport PyMem_Malloc, PyMem_Free
DEF LN2_SQUARED = 0.480453013918201 # ln(2)^2
DEF LN2 = 0.693147180559945 # ln(2)
cdef extern from "murmur2/murmurhash2.h":
unsigned int murmurhash2(const void * key, int len, const unsigned int seed)
cdef:
void * PyMem_Calloc(size_t num, size_t size):
cdef void *p
cdef size_t total = num * size
p = PyMem_Malloc(total)
if p != NULL:
memset(p, 0, total)
return p
struct __bloomfilter:
int capacity
double error
int bits
int bytes
int hashes
double bits_per_element
unsigned char *bf
class BloomFilter:
cdef __bloomfilter *__bf
def __init__(self, capacity, error):
pass # no-op for signature
def __cinit__(self, int capacity, double error):
printf("Initiated with capacity=%d and error %f\n", capacity, error)
self.__bf = <__bloomfilter *>PyMem_Malloc(sizeof(__bloomfilter))
if not self.__bf:
raise MemoryError("Unable to allocate memory for BloomFilter")
if capacity < 1:
raise ValueError("BloomFilter must hold at least one element")
if error <= 0.0 or error >= 1.0:
raise ValueError(("BloomFilter needs to specify positive error "
"between (0.0, 1.0)"))
self.__bf.capacity = capacity
self.__bf.error = error
cdef double num = log(self.__bf.error)
self.__bf.bits_per_element = -(num / LN2_SQUARED)
cdef double dcapacity = <double>capacity
self.__bf.bits = <int>(dcapacity * self.__bf.bits_per_element)
if self.__bf.bits % 8:
self.__bf.bytes = (self.__bf.bits / 8) + 1
else:
self.__bf.bytes = self.__bf.bits / 8
self.__bf.hashes = <int>ceil(LN2 * self.__bf.bits_per_element) # ln(2)
self.__bf.bf = <unsigned char *>PyMem_Calloc(self.__bf.bytes, sizeof(unsigned char))
if not self.__bf.bf:
raise MemoryError("Unable to allocate memory for bloom filter")
def __dealloc__(self):
printf("dealloc called\n")
if self.__bf != NULL and self.__bf.bf != NULL:
# Still need to debug this, if we instatiate a BloomFilter but
# it raises TypeError, __dealloc__ is called, but we free memory
# that was somehow allocated (not NULL), but didn't belong to
# us?
PyMem_Free(self.__bf.bf)
PyMem_Free(self.__bf) # no-op if self.__bf is NULL
cdef int __check_or_add(self, const char *value, int should_add=1):
cdef int hits = 0
cdef unsigned int val_len = len(value)
cdef unsigned int a = murmurhash2(value, val_len, 0x9747b28c)
cdef unsigned int b = murmurhash2(value, val_len, a)
cdef unsigned int x
cdef unsigned int i
cdef unsigned int byte
cdef unsigned int mask
cdef unsigned char c
for i in range(self.__bf.hashes):
x = (a + i * b) % self.__bf.bits
byte = x >> 3
c = self.__bf.bf[byte]
mask = 1 << (x % 8)
if c & mask:
hits += 1
else:
if should_add == 1:
self.__bf.bf[byte] = c | mask
if hits == self.__bf.hashes:
return 1 # 1 == element already in (or collision)
return 0
def add(self, const char *value):
return self.__check_or_add(value, True) == 1
def __contains__(self, const char *value):
return self.__check_or_add(value) == 1
def __repr__(self):
return u'BloomFilter(capacity={}, error={})'.format(self.__bf.capacity, self.__bf.error)
import unittest
from uuid import uuid4
from cbloomfilter import BloomFilter
class TestCBloomFilter(unittest.TestCase):
def test_basic(self):
"""Simple test cases."""
self.assertRaises(TypeError, BloomFilter, 'mike', 'mike')
self.assertRaises(TypeError, BloomFilter, -1, 0)
self.assertRaises(TypeError, BloomFilter, 10, 0)
bf = BloomFilter(100, 0.025)
self.assertEquals(bf.add('mike'), False)
self.assertEquals(bf.add('mike'), True)
self.assertIn('mike', bf)
def test_random(self):
bf = BloomFilter(100, 0.025)
collisions = 0
for i in range(1000):
key = str(uuid4())
if bf.add(key):
collisions += 1
print 'Added {:,} elements and got {:,} collisions'.format(1000,
collisions)
if __name__ == '__main__':
unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment