-
-
Save vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a to your computer and use it in GitHub Desktop.
#!/bin/python | |
from flashtext.keyword import KeywordProcessor | |
import random | |
import string | |
import re | |
import time | |
def get_word_of_length(str_length): | |
# generate a random word of given length | |
return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length)) | |
# generate a list of 100K words of randomly chosen size | |
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)] | |
print('Count | FlashText | Regex ') | |
print('-------------------------------') | |
for keywords_length in range(1, 20002, 1000): | |
# chose 5000 terms and create a string to search in. | |
all_words_chosen = random.sample(all_words, 5000) | |
story = ' '.join(all_words_chosen) | |
# get unique keywords from the list of words generated. | |
unique_keywords_sublist = list(set(random.sample(all_words, keywords_length))) | |
# compile regex | |
# source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings | |
rep = dict([(key, '_keyword_') for key in unique_keywords_sublist]) | |
compiled_re = re.compile("|".join(rep.keys())) | |
# add keywords to flashtext | |
keyword_processor = KeywordProcessor() | |
for keyword in unique_keywords_sublist: | |
keyword_processor.add_keyword(keyword, '_keyword_') | |
# time the modules | |
start = time.time() | |
_ = keyword_processor.replace_keywords(story) | |
mid = time.time() | |
_ = compiled_re.sub(lambda m: rep[re.escape(m.group(0))], story) | |
end = time.time() | |
# print output | |
print(str(keywords_length).ljust(6), '|', | |
"{0:.5f}".format(mid - start).ljust(9), '|', | |
"{0:.5f}".format(end - mid).ljust(9), '|',) | |
# Count | FlashText | Regex | |
# ------------------------------- | |
# 1 | 0.02141 | 0.00004 | | |
# 1001 | 0.02498 | 0.13180 | | |
# 5001 | 0.03147 | 0.59799 | | |
# 10001 | 0.02858 | 1.08717 | | |
# 15001 | 0.02734 | 1.51461 | | |
# 20001 | 0.03109 | 1.76158 | |
am i missing something or is there a reason not to compare vs a dictionary based replace, which seems to run an order of magnitude faster than flashtext on this benchmark
replaced = ' '.join([rep.get(word, word) for word in story.split(" ")])
Count | FlashText | Dict
1 | 0.00777 | 0.00093 |
1001 | 0.00884 | 0.00108 |
2001 | 0.00911 | 0.00109 |
3001 | 0.00919 | 0.00109 |
4001 | 0.00928 | 0.00110 |
5001 | 0.00941 | 0.00112 |
6001 | 0.00957 | 0.00130 |
7001 | 0.00960 | 0.00126 |
8001 | 0.00970 | 0.00127 |
9001 | 0.00985 | 0.00124 |
10001 | 0.00999 | 0.00125 |
11001 | 0.01020 | 0.00123 |
12001 | 0.01044 | 0.00143 |
13001 | 0.01028 | 0.00139 |
14001 | 0.01030 | 0.00139 |
15001 | 0.01010 | 0.00131 |
16001 | 0.01066 | 0.00135 |
17001 | 0.01074 | 0.00132 |
18001 | 0.01061 | 0.00133 |
19001 | 0.01085 | 0.00137 |
20001 | 0.01078 | 0.00134 |
Completely off-topic: your shebang should be #!/usr/bin/env python3
The above commented example will never find the longest match and replace it. So, don't get the wrong idea.! For find and replace, Flashtext is correct choice.!
It doesn't get you in to "comparable to FlashText" territory, but an easy regex optimisation to apply is to sort the keys before adding to the regex.
The speed improvement is quite varied depending on the keys chosen, but it's almost always a safe thing to do.