Skip to content

Instantly share code, notes, and snippets.

@ralexx
Created May 11, 2025 01:02
Show Gist options
  • Save ralexx/092bceee0f8e5e8a45322c20b3c6097d to your computer and use it in GitHub Desktop.
Save ralexx/092bceee0f8e5e8a45322c20b3c6097d to your computer and use it in GitHub Desktop.
import hashlib
import math
import sqlite3
class BloomFilter:
"""
A Bloom filter implementation for SQLite database membership testing.
Minimizes database reads by probabilistically checking for membership
before querying the database.
"""
def __init__(self, capacity, error_rate=0.01, db_path=':memory:', table_name='bloom_table'):
"""
Initializes the Bloom filter.
Args:
capacity (int): The expected number of items to store in the filter.
error_rate (float): The desired false positive rate (default: 0.01).
db_path (str): Path to the SQLite database (default: in-memory).
table_name (str): Name of the table to store the filter data (default: bloom_table).
"""
self.capacity = capacity
self.error_rate = error_rate
self.db_path = db_path
self.table_name = table_name
self.num_bits = self.calculate_num_bits(capacity, error_rate)
self.num_hashes = self.calculate_num_hashes(self.num_bits, capacity)
self.conn = sqlite3.connect(self.db_path)
self.cursor = self.conn.cursor()
self.create_table()
def calculate_num_bits(self, capacity, error_rate):
"""
Calculates the optimal number of bits for the Bloom filter.
"""
return int(-(capacity * math.log(error_rate)) / (math.log(2) ** 2))
def calculate_num_hashes(self, num_bits, capacity):
"""
Calculates the optimal number of hash functions.
"""
return int((num_bits / capacity) * math.log(2))
def create_table(self):
"""
Creates the SQLite table to store the Bloom filter bit array.
"""
self.cursor.execute(f"""
CREATE TABLE IF NOT EXISTS {self.table_name} (
bit_index INTEGER PRIMARY KEY,
bit_value INTEGER
)
""")
self.conn.commit()
self.initialize_bits()
def initialize_bits(self):
"""
Initializes all bits in the bit array to 0.
"""
bits_to_insert = [(i, 0) for i in range(self.num_bits)]
self.cursor.executemany(f"INSERT INTO {self.table_name} (bit_index, bit_value) VALUES (?, ?)", bits_to_insert)
self.conn.commit()
def add(self, item):
"""
Adds an item to the Bloom filter.
"""
for i in range(self.num_hashes):
index = self.hash_function(item, i) % self.num_bits
self.set_bit(index, 1)
def check(self, item):
"""
Checks if an item is potentially in the Bloom filter.
"""
for i in range(self.num_hashes):
index = self.hash_function(item, i) % self.num_bits
if self.get_bit(index) == 0:
return False # Definitely not in the set
return True # Potentially in the set
def hash_function(self, item, seed):
"""
Generates a hash value for the item using a seed.
Uses SHA256 for hashing.
"""
item_str = str(item).encode('utf-8') # Ensure item is encoded as bytes
combined_str = item_str + str(seed).encode('utf-8')
return int(hashlib.sha256(combined_str).hexdigest(), 16)
def set_bit(self, index, value):
"""
Sets the bit at the given index to the specified value.
"""
self.cursor.execute(f"UPDATE {self.table_name} SET bit_value = ? WHERE bit_index = ?", (value, index))
self.conn.commit()
def get_bit(self, index):
"""
Gets the bit at the given index.
"""
self.cursor.execute(f"SELECT bit_value FROM {self.table_name} WHERE bit_index = ?", (index,))
result = self.cursor.fetchone()
return result[0] if result else 0 # Return 0 if index doesn't exist
def close(self):
"""
Closes the database connection.
"""
self.conn.close()
# Example Usage
if __name__ == '__main__':
# Create a Bloom filter with a capacity of 1000 and a 1% error rate
bloom_filter = BloomFilter(capacity=1000, error_rate=0.01)
# Add some items to the filter
items_to_add = ['apple', 'banana', 'cherry', 'date', 'fig']
for item in items_to_add:
bloom_filter.add(item)
# Check membership of some items
print("Checking membership:")
for item in ['apple', 'banana', 'grape', 'date', 'orange']:
if bloom_filter.check(item):
print(f" {item}: Potentially in the set")
else:
print(f" {item}: Definitely not in the set")
# Simulate database interaction
def check_database(item, conn):
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM my_table WHERE value = ?", (item,))
count = cursor.fetchone()[0]
return count > 0
# Create a dummy database table
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute("CREATE TABLE my_table (value TEXT)")
cursor.executemany("INSERT INTO my_table (value) VALUES (?)", items_to_add)
conn.commit()
# Use Bloom filter to reduce database queries
print("\nUsing Bloom filter to reduce database queries:")
for item in ['apple', 'banana', 'grape', 'date', 'orange']:
if bloom_filter.check(item):
print(f" {item}: Checking database...")
if check_database(item, conn):
print(f" {item}: Found in database")
else:
print(f" {item}: Not found in database (false positive)")
else:
print(f" {item}: Definitely not in database (Bloom filter says so)")
bloom_filter.close()
conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment