vi3k6i5 · December 12, 2017 16:04
diff --git a/gistfile1.txt b/gistfile1.txt
 import re

 class Trie():
    """Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
    The corresponding Regex should match much faster than a simple Regex union."""

    def __init__(self):
        self.data = {}

    def add(self, word):
        ref = self.data
        for char in word:
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        ref[''] = 1

    def dump(self):
        return self.data

    def quote(self, char):
        return re.escape(char)

    def _pattern(self, pData):
        data = pData
        if "" in data and len(data.keys()) == 1:
            return None

        alt = []
        cc = []
        q = 0
        for char in sorted(data.keys()):
            if isinstance(data[char], dict):
                try:
                    recurse = self._pattern(data[char])
                    alt.append(self.quote(char) + recurse)
                except:
                    cc.append(self.quote(char))
            else:
                q = 1
        cconly = not len(alt) > 0

        if len(cc) > 0:
            if len(cc) == 1:
                alt.append(cc[0])
            else:
                alt.append('[' + ''.join(cc) + ']')

        if len(alt) == 1:
            result = alt[0]
        else:
            result = "(?:" + "|".join(alt) + ")"

        if q:
            if cconly:
                result += "?"
            else:
                result = "(?:%s)?" % result
        return result

    def pattern(self):
        return self._pattern(self.dump())
        
 import string

 import re
 import timeit
 import random

 def get_word_of_length(str_length):
    # generate a random word of given length
    return ''.join(random.choice(string.ascii_lowercase + ' +-\\') for _ in range(str_length))

 # generate a list of 100K words of randomly chosen size
 all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]

 random.shuffle(all_words)

 def trie_regex_from_words(words):
    trie = Trie()
    for word in words:
        trie.add(word)
    return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE)

 def find(word):
    def fun():
        return union.match(word)
    return fun
    
    
 #!/bin/python
 from flashtext.keyword import KeywordProcessor
 import random
 import string
 import re
 import time

 print('Count  | FlashText | Regex    ')
 print('-------------------------------')
 for keywords_length in range(0, 20001, 1000):
    # chose 5000 terms and create a string to search in.
    all_words_chosen = random.sample(all_words, 5000)
    story = ' '.join(all_words_chosen)

    # get unique keywords from the list of words generated.
    unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

    # compile regex
    compiled_re = trie_regex_from_words(unique_keywords_sublist)

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(unique_keywords_sublist)

    # time the modules
    start = time.time()
    _ = keyword_processor.extract_keywords(story)
    mid = time.time()
    _ = compiled_re.findall(story)
    end = time.time()
    # print output
    print(str(keywords_length).ljust(6), '|',
          "{0:.5f}".format(mid - start).ljust(9), '|',
          "{0:.5f}".format(end - mid).ljust(9), '|',)
          
 Count  | FlashText | Regex    
 -------------------------------
 0      | 0.01879   | 0.00258   |
 1000   | 0.02584   | 0.01178   |
 5000   | 0.02841   | 0.01469   |
 10000  | 0.02966   | 0.01558   |
 15000  | 0.03217   | 0.01875   |
 20000  | 0.03328   | 0.01762   |
	import re

	class Trie():
	"""Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
	The corresponding Regex should match much faster than a simple Regex union."""

	def __init__(self):
	self.data = {}

	def add(self, word):
	ref = self.data
	for char in word:
	ref[char] = char in ref and ref[char] or {}
	ref = ref[char]
	ref[''] = 1

	def dump(self):
	return self.data

	def quote(self, char):
	return re.escape(char)

	def _pattern(self, pData):
	data = pData
	if "" in data and len(data.keys()) == 1:
	return None

	alt = []
	cc = []
	q = 0
	for char in sorted(data.keys()):
	if isinstance(data[char], dict):
	try:
	recurse = self._pattern(data[char])
	alt.append(self.quote(char) + recurse)
	except:
	cc.append(self.quote(char))
	else:
	q = 1
	cconly = not len(alt) > 0

	if len(cc) > 0:
	if len(cc) == 1:
	alt.append(cc[0])
	else:
	alt.append('[' + ''.join(cc) + ']')

	if len(alt) == 1:
	result = alt[0]
	else:
	result = "(?:" + "\|".join(alt) + ")"

	if q:
	if cconly:
	result += "?"
	else:
	result = "(?:%s)?" % result
	return result

	def pattern(self):
	return self._pattern(self.dump())

	import string

	import re
	import timeit
	import random

	def get_word_of_length(str_length):
	# generate a random word of given length
	return ''.join(random.choice(string.ascii_lowercase + ' +-\\') for _ in range(str_length))

	# generate a list of 100K words of randomly chosen size
	all_words = [get_word_of_length(random.choice([3, 4, 5, 6, 7, 8])) for i in range(100000)]

	random.shuffle(all_words)

	def trie_regex_from_words(words):
	trie = Trie()
	for word in words:
	trie.add(word)
	return re.compile(r"\b" + trie.pattern() + r"\b", re.IGNORECASE)

	def find(word):
	def fun():
	return union.match(word)
	return fun


	#!/bin/python
	from flashtext.keyword import KeywordProcessor
	import random
	import string
	import re
	import time

	print('Count \| FlashText \| Regex ')
	print('-------------------------------')
	for keywords_length in range(0, 20001, 1000):
	# chose 5000 terms and create a string to search in.
	all_words_chosen = random.sample(all_words, 5000)
	story = ' '.join(all_words_chosen)

	# get unique keywords from the list of words generated.
	unique_keywords_sublist = list(set(random.sample(all_words, keywords_length)))

	# compile regex
	compiled_re = trie_regex_from_words(unique_keywords_sublist)

	# add keywords to flashtext
	keyword_processor = KeywordProcessor()
	keyword_processor.add_keywords_from_list(unique_keywords_sublist)

	# time the modules
	start = time.time()
	_ = keyword_processor.extract_keywords(story)
	mid = time.time()
	_ = compiled_re.findall(story)
	end = time.time()
	# print output
	print(str(keywords_length).ljust(6), '\|',
	"{0:.5f}".format(mid - start).ljust(9), '\|',
	"{0:.5f}".format(end - mid).ljust(9), '\|',)

	Count \| FlashText \| Regex
	-------------------------------
	0 \| 0.01879 \| 0.00258 \|
	1000 \| 0.02584 \| 0.01178 \|
	5000 \| 0.02841 \| 0.01469 \|
	10000 \| 0.02966 \| 0.01558 \|
	15000 \| 0.03217 \| 0.01875 \|
	20000 \| 0.03328 \| 0.01762 \|
No results found