Created
December 12, 2017 16:04
-
-
Save vi3k6i5/4ea37490cddf6d8b4a1daf13f6e51457 to your computer and use it in GitHub Desktop.
Compare flashtext to Trie re
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
class Trie(): | |
"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern. | |
The corresponding Regex should match much faster than a simple Regex union.""" | |
def __init__(self): | |
self.data = {} | |
def add(self, word): | |
ref = self.data | |
for char in word: | |
ref[char] = char in ref and ref[char] or {} | |
ref = ref[char] | |
ref[''] = 1 | |
def dump(self): | |
return self.data | |
def quote(self, char): | |
return re.escape(char) | |
def _pattern(self, pData): | |
data = pData | |
if "" in data and len(data.keys()) == 1: | |
return None | |
alt = [] | |
cc = [] | |
q = 0 | |
for char in sorted(data.keys()): | |
if isinstance(data[char], dict): | |
try: | |
recurse = self._pattern(data[char]) | |
alt.append(self.quote(char) + recurse) | |
except: | |
cc.append(self.quote(char)) | |
else: | |
q = 1 | |
cconly = not len(alt) > 0 | |
if len(cc) > 0: | |
if len(cc) == 1: | |
alt.append(cc[0]) | |
else: | |
alt.append('[' + ''.join(cc) + ']') | |
if len(alt) == 1: | |
result = alt[0] | |
else: | |
result = "(?:" + "|".join(alt) + ")" | |
if q: | |
if cconly: | |
result += "?" | |
else: | |
result = "(?:%s)?" % result | |
return result | |
def pattern(self): | |
return self._pattern(self.dump()) | |
import string | |
import re | |
import timeit | |
import random | |
def get_word_of_length(str_length): | |
# generate a random word of given length | |
return ''.join(random.choice(string.ascii_lowercase + ' +-\\') for _ in range(str_length)) | |
# generate a list of 100K words of randomly chosen size | |
all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)] | |
random.shuffle(all_words) | |
def trie_regex_from_words(words): | |
trie = Trie() | |
for word in words: | |
trie.add(word) | |
return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE) | |
def find(word): | |
def fun(): | |
return union.match(word) | |
return fun | |
#!/bin/python | |
from flashtext.keyword import KeywordProcessor | |
import random | |
import string | |
import re | |
import time | |
print('Count | FlashText | Regex ') | |
print('-------------------------------') | |
for keywords_length in range(0, 20001, 1000): | |
# chose 5000 terms and create a string to search in. | |
all_words_chosen = random.sample(all_words, 5000) | |
story = ' '.join(all_words_chosen) | |
# get unique keywords from the list of words generated. | |
unique_keywords_sublist = list(set(random.sample(all_words, keywords_length))) | |
# compile regex | |
compiled_re = trie_regex_from_words(unique_keywords_sublist) | |
# add keywords to flashtext | |
keyword_processor = KeywordProcessor() | |
keyword_processor.add_keywords_from_list(unique_keywords_sublist) | |
# time the modules | |
start = time.time() | |
_ = keyword_processor.extract_keywords(story) | |
mid = time.time() | |
_ = compiled_re.findall(story) | |
end = time.time() | |
# print output | |
print(str(keywords_length).ljust(6), '|', | |
"{0:.5f}".format(mid - start).ljust(9), '|', | |
"{0:.5f}".format(end - mid).ljust(9), '|',) | |
Count | FlashText | Regex | |
------------------------------- | |
0 | 0.01879 | 0.00258 | | |
1000 | 0.02584 | 0.01178 | | |
5000 | 0.02841 | 0.01469 | | |
10000 | 0.02966 | 0.01558 | | |
15000 | 0.03217 | 0.01875 | | |
20000 | 0.03328 | 0.01762 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment