Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Created December 2, 2023 15:53
Show Gist options
  • Save hughdbrown/5367403afe2334423f3797a22b85d5d3 to your computer and use it in GitHub Desktop.
Save hughdbrown/5367403afe2334423f3797a22b85d5d3 to your computer and use it in GitHub Desktop.
Bloom filter implementation from medium.com article. Seems broken.
import math
import hashlib
from bitarray import bitarray
class BloomFilter:
def __init__(self, n_items, fp_prob):
'''
n_items : int
Number of items expected to be stored in bloom filter
fp_prob : float
False Positive probability in decimal
'''
# False possible probability in decimal
self.fp_prob = fp_prob
# Size of bit array to use
self.size = self.get_size(n_items,fp_prob)
print(f"{self.size = }")
# number of hash functions to use
self.hash_count = self.get_hash_count(self.size,n_items)
print(f"{self.hash_count = }")
# Bit array of given size
self.bit_array = bitarray(self.size)
# initialize all bits as 0
self.bit_array.setall(0)
def add(self, item):
'''
Add an item in the filter
'''
print(f"add({item})")
for i in range(self.hash_count):
digest = hashlib.md5(str(item).encode('utf-8'))
# perform double hashing
result = int(digest.hexdigest(), 16)
bit = result % self.size
print(f"add {bit = }")
self.bit_array[bit] = True
def check(self, item):
'''
Check for existence of an item in filter
'''
print(f"check({item})")
for i in range(self.hash_count):
digest = hashlib.md5(str(item).encode('utf-8'))
result = int(digest.hexdigest(), 16)
bit = result % self.size
print(f"check {bit = }")
if self.bit_array[bit] == False:
return False
return True
@classmethod
def get_size(self,n,p):
'''
Return the size of bit array(m) to be used
'''
print(f"get_size({n = }, {p = })")
m = -(n * math.log(p))/(math.log(2)**2)
return int(m)
@classmethod
def get_hash_count(self, m, n):
'''
Return the hash function(k) to be used
'''
print(f"get_hash_count({m = }, {n = })")
k = (m/n) * math.log(2)
print(f"{k = } {m/n = }")
return int(k)
if __name__ == '__main__':
# This has no hash functions so nothing is findable
bf = BloomFilter(1000, 0.5)
# -----
# This has multiple hash functions, but they are all the same, setting
# the same bit multiple times.
bf = BloomFilter(1000, 0.05)
bf.add(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment