Created
January 21, 2019 18:13
-
-
Save fjsj/f7f544ebcedb1ad931a4d31cdc9d2fb5 to your computer and use it in GitHub Desktop.
Python bloom filters performance test (based on pybloomfilter's own test)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import os | |
import tempfile | |
import time | |
import timeit | |
import pybloomfilter | |
tempfiles = [] | |
ERROR_RATE = 0.01 | |
NUM_K = 7 # https://hur.st/bloomfilter/?n=97070&p=0.01&m=&k= | |
#def get_and_add_words(Creator, wordlist): | |
def get_and_add_words(Creator, wordlist): | |
bf = Creator(len(wordlist), ERROR_RATE) | |
if isinstance(bf, pybloomfilter.BloomFilter): | |
add_method = lambda word: bf.add(word.encode('utf-8')) | |
elif isinstance(bf, ScalableCuckooFilter): | |
add_method = bf.insert | |
else: | |
add_method = bf.add | |
for word in wordlist: | |
add_method(word) | |
return bf | |
def check_words(bf, wordlist): | |
if isinstance(bf, pybloomfilter.BloomFilter): | |
contains_method = lambda word: word.encode('utf-8') in bf | |
elif isinstance(bf, ScalableCuckooFilter): | |
contains_method = bf.contains | |
# elif isinstance(bf, pyreBloom.pyreBloom): | |
# contains_method = bf.contains | |
else: | |
contains_method = bf.__contains__ | |
for word in wordlist: | |
contains_method(word) | |
def test_errors(bf, correct_wordlist, test_wordlist): | |
errors = [0, 0] | |
if isinstance(bf, pybloomfilter.BloomFilter): | |
contains_method = lambda word: word.encode('utf-8') in bf | |
elif isinstance(bf, ScalableCuckooFilter): | |
contains_method = bf.contains | |
else: | |
contains_method = bf.__contains__ | |
for word in test_wordlist: | |
word = word | |
if contains_method(word): | |
if word not in correct_wordlist: | |
errors[0] += 1 | |
else: | |
if word in correct_wordlist: | |
errors[1] += 1 | |
print('%0.2f%% positive %0.2f%% negative' % ( | |
errors[0] / float(len(correct_wordlist)) * 100, | |
errors[1] / float(len(correct_wordlist)) * 100)) | |
def create_word_list(filename): | |
f = open(filename, 'r') | |
words_set = set() | |
for line in f: | |
line = line.strip().lower() | |
if line: | |
words_set.add(line) | |
f.close() | |
return words_set | |
def create_cbloomfilter(*args): | |
args = list(args) | |
f = tempfile.NamedTemporaryFile() | |
tempfiles.append(f) | |
os.unlink(f.name) | |
args.append(f.name.encode('utf-8')) | |
return pybloomfilter.BloomFilter(*tuple(args)) | |
creators = [create_cbloomfilter] | |
import pybloom_live | |
creators.append(pybloom_live.BloomFilter) | |
from cuckoo.filter import ScalableCuckooFilter | |
creators.append(ScalableCuckooFilter) | |
import pybloof | |
creators.append(lambda words_len, error_rate: pybloof.StringBloomFilter(words_len, NUM_K)) | |
# import pyreBloom | |
# creators.append(lambda words_len, error_rate: pyreBloom.pyreBloom('myBloomFilter'.encode('utf-8'), words_len, error_rate)) | |
import mmh3 | |
from probables.hashes import hash_with_depth_bytes | |
@hash_with_depth_bytes | |
def my_hash(key): | |
return mmh3.hash_bytes(key) | |
import probables | |
creators.append(lambda words_len, error_rate: probables.BloomFilter(words_len, error_rate, hash_function=my_hash)) | |
creators = creators[::-1] | |
def run_test(): | |
dict_wordlist = create_word_list('words') | |
test_wordlist = create_word_list('testwords') | |
NUM = 10 | |
for creator in creators: | |
start = time.time() | |
if NUM: | |
t = timeit.Timer(lambda : get_and_add_words(creator, dict_wordlist)) | |
print("%s took %0.5f s/run" % ( | |
creator, | |
t.timeit(NUM) / float(NUM))) | |
bf = get_and_add_words(creator, dict_wordlist) | |
if NUM: | |
t = timeit.Timer(lambda : check_words(bf, test_wordlist)) | |
print("%s took %0.5f s/run" % ( | |
creator, | |
t.timeit(NUM) / float(NUM))) | |
test_errors(bf, dict_wordlist, test_wordlist) | |
print('--------------------') | |
if __name__ == "__main__": | |
run_test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment