Created
November 14, 2017 16:35
-
-
Save vi3k6i5/06b12bd2c5abde6de9dcfa43abc3d362 to your computer and use it in GitHub Desktop.
Comparing flashtext with a cython implementation of similar algo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import random
import re
import string
import time

from automaton import Automaton
from flashtext.keyword import KeywordProcessor
def get_word_of_length(str_length):
    """Return a random word made of ``str_length`` lowercase ASCII letters.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``. A length of zero or less yields the empty
    string, because ``range`` is then empty.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))
# generate a list of 100K random words, each 3 to 8 characters long
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for _ in range(100000)]

print('Count | FlashText | Automaton ')
print('-------------------------------')

# Benchmark both libraries for keyword counts 0, 1000, ..., 20000.
for keywords_length in range(0, 20001, 1000):
    # choose 5000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    # all_words may contain repeats, so after de-duplication there can be
    # slightly fewer than keywords_length keywords.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # compile Automaton: it is built from a dict mapping each keyword to a
    # one-element list holding that same keyword.
    A = Automaton(ignoreAccents=True, ignoreCase=True)
    dictionary = {val: [val] for val in unique_keywords_sublist}
    A.build(dictionary)

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # time the modules: only the search over `story` is measured, not the
    # setup above. perf_counter() is used instead of time() because it is a
    # monotonic, high-resolution clock meant for measuring short intervals.
    start = time.perf_counter()
    _ = keyword_processor.extract_keywords(story)
    mid = time.perf_counter()
    _ = A.read(story)
    end = time.perf_counter()

    # print output: keyword count, FlashText seconds, Automaton seconds
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',)
# Output (elapsed seconds per library for each keyword count):
# Count | FlashText | Automaton
# -------------------------------
# 0      | 0.01584   | 0.02289   |
# 1000   | 0.02029   | 0.05322   |
# 2000   | 0.01965   | 0.05194   |
# 3000   | 0.02026   | 0.06139   |
# 4000   | 0.02118   | 0.07087   |
# 5000   | 0.02883   | 0.06348   |
# 6000   | 0.02281   | 0.12364   |
# 7000   | 0.02209   | 0.06058   |
# 8000   | 0.01994   | 0.06167   |
# 9000   | 0.02393   | 0.11298   |
# 10000  | 0.02939   | 0.07494   |
# 11000  | 0.02433   | 0.07365   |
# 12000  | 0.02576   | 0.07373   |
# 13000  | 0.02193   | 0.06714   |
# 14000  | 0.02301   | 0.07322   |
# 15000  | 0.02546   | 0.07115   |
# 16000  | 0.02495   | 0.13117   |
# 17000  | 0.02705   | 0.15396   |
# 18000  | 0.02646   | 0.08084   |
# 19000  | 0.02621   | 0.08861   |
# 20000  | 0.02399   | 0.07886   |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment