Created
May 11, 2025 01:02
-
-
Save ralexx/092bceee0f8e5e8a45322c20b3c6097d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import math | |
import sqlite3 | |
class BloomFilter: | |
""" | |
A Bloom filter implementation for SQLite database membership testing. | |
Minimizes database reads by probabilistically checking for membership | |
before querying the database. | |
""" | |
def __init__(self, capacity, error_rate=0.01, db_path=':memory:', table_name='bloom_table'): | |
""" | |
Initializes the Bloom filter. | |
Args: | |
capacity (int): The expected number of items to store in the filter. | |
error_rate (float): The desired false positive rate (default: 0.01). | |
db_path (str): Path to the SQLite database (default: in-memory). | |
table_name (str): Name of the table to store the filter data (default: bloom_table). | |
""" | |
self.capacity = capacity | |
self.error_rate = error_rate | |
self.db_path = db_path | |
self.table_name = table_name | |
self.num_bits = self.calculate_num_bits(capacity, error_rate) | |
self.num_hashes = self.calculate_num_hashes(self.num_bits, capacity) | |
self.conn = sqlite3.connect(self.db_path) | |
self.cursor = self.conn.cursor() | |
self.create_table() | |
def calculate_num_bits(self, capacity, error_rate): | |
""" | |
Calculates the optimal number of bits for the Bloom filter. | |
""" | |
return int(-(capacity * math.log(error_rate)) / (math.log(2) ** 2)) | |
def calculate_num_hashes(self, num_bits, capacity): | |
""" | |
Calculates the optimal number of hash functions. | |
""" | |
return int((num_bits / capacity) * math.log(2)) | |
def create_table(self): | |
""" | |
Creates the SQLite table to store the Bloom filter bit array. | |
""" | |
self.cursor.execute(f""" | |
CREATE TABLE IF NOT EXISTS {self.table_name} ( | |
bit_index INTEGER PRIMARY KEY, | |
bit_value INTEGER | |
) | |
""") | |
self.conn.commit() | |
self.initialize_bits() | |
def initialize_bits(self): | |
""" | |
Initializes all bits in the bit array to 0. | |
""" | |
bits_to_insert = [(i, 0) for i in range(self.num_bits)] | |
self.cursor.executemany(f"INSERT INTO {self.table_name} (bit_index, bit_value) VALUES (?, ?)", bits_to_insert) | |
self.conn.commit() | |
def add(self, item): | |
""" | |
Adds an item to the Bloom filter. | |
""" | |
for i in range(self.num_hashes): | |
index = self.hash_function(item, i) % self.num_bits | |
self.set_bit(index, 1) | |
def check(self, item): | |
""" | |
Checks if an item is potentially in the Bloom filter. | |
""" | |
for i in range(self.num_hashes): | |
index = self.hash_function(item, i) % self.num_bits | |
if self.get_bit(index) == 0: | |
return False # Definitely not in the set | |
return True # Potentially in the set | |
def hash_function(self, item, seed): | |
""" | |
Generates a hash value for the item using a seed. | |
Uses SHA256 for hashing. | |
""" | |
item_str = str(item).encode('utf-8') # Ensure item is encoded as bytes | |
combined_str = item_str + str(seed).encode('utf-8') | |
return int(hashlib.sha256(combined_str).hexdigest(), 16) | |
def set_bit(self, index, value): | |
""" | |
Sets the bit at the given index to the specified value. | |
""" | |
self.cursor.execute(f"UPDATE {self.table_name} SET bit_value = ? WHERE bit_index = ?", (value, index)) | |
self.conn.commit() | |
def get_bit(self, index): | |
""" | |
Gets the bit at the given index. | |
""" | |
self.cursor.execute(f"SELECT bit_value FROM {self.table_name} WHERE bit_index = ?", (index,)) | |
result = self.cursor.fetchone() | |
return result[0] if result else 0 # Return 0 if index doesn't exist | |
def close(self): | |
""" | |
Closes the database connection. | |
""" | |
self.conn.close() | |
# Example Usage | |
if __name__ == '__main__': | |
# Create a Bloom filter with a capacity of 1000 and a 1% error rate | |
bloom_filter = BloomFilter(capacity=1000, error_rate=0.01) | |
# Add some items to the filter | |
items_to_add = ['apple', 'banana', 'cherry', 'date', 'fig'] | |
for item in items_to_add: | |
bloom_filter.add(item) | |
# Check membership of some items | |
print("Checking membership:") | |
for item in ['apple', 'banana', 'grape', 'date', 'orange']: | |
if bloom_filter.check(item): | |
print(f" {item}: Potentially in the set") | |
else: | |
print(f" {item}: Definitely not in the set") | |
# Simulate database interaction | |
def check_database(item, conn): | |
cursor = conn.cursor() | |
cursor.execute("SELECT COUNT(*) FROM my_table WHERE value = ?", (item,)) | |
count = cursor.fetchone()[0] | |
return count > 0 | |
# Create a dummy database table | |
conn = sqlite3.connect(':memory:') | |
cursor = conn.cursor() | |
cursor.execute("CREATE TABLE my_table (value TEXT)") | |
cursor.executemany("INSERT INTO my_table (value) VALUES (?)", items_to_add) | |
conn.commit() | |
# Use Bloom filter to reduce database queries | |
print("\nUsing Bloom filter to reduce database queries:") | |
for item in ['apple', 'banana', 'grape', 'date', 'orange']: | |
if bloom_filter.check(item): | |
print(f" {item}: Checking database...") | |
if check_database(item, conn): | |
print(f" {item}: Found in database") | |
else: | |
print(f" {item}: Not found in database (false positive)") | |
else: | |
print(f" {item}: Definitely not in database (Bloom filter says so)") | |
bloom_filter.close() | |
conn.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment