2minchul · October 20, 2020 09:24
diff --git a/pyahocorasick_example.py b/pyahocorasick_example.py
 import ahocorasick  # pip install pyahocorasick

 """
 See: http://ieva.rocks/2016/11/24/keyword-matching-with-aho-corasick/
 """


 def make_aho_automaton(keywords):
    """Build a ready-to-search Aho-Corasick automaton from keyword pairs.

    ``keywords`` is an iterable of ``(key, category)`` pairs. Every key is
    added to the trie with the payload ``(category, key)``, which is the
    value a search over the returned automaton reports for each match.
    """
    automaton = ahocorasick.Automaton()
    for word, category in keywords:
        # Keep the keyword itself in the payload so matches carry their text.
        automaton.add_word(word, (category, word))
    # Turn the populated trie into a searchable automaton.
    automaton.make_automaton()
    return automaton


 def find_keywords(line, a):
    """Return the keywords that automaton ``a`` matches in ``line``.

    Keywords are listed in the order ``a.iter(line)`` yields them, one entry
    per match.
    """
    # Each item is (end_index, (category, keyword)); only the keyword is kept.
    return [keyword for _end, (_category, keyword) in a.iter(line)]


 def find_keyword_locations(line, a, padding=1):
    """Mark which characters of ``line`` belong to a matched keyword.

    Args:
        line: Text to search.
        a: Automaton whose ``iter(line)`` yields ``(end_index, (cat, key))``
            with ``end_index`` being the position of the match's last
            character.
        padding: Number of delimiter characters on each side of every stored
            key that must stay unmarked. The default of 1 fits keys stored
            as ``' word '``; pass 0 for keys stored without padding.

    Returns:
        A list of booleans, one per character of ``line``; ``True`` marks a
        character that lies inside a keyword (padding excluded).

    Raises:
        ValueError: If ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError("padding must be non-negative")
    line_indices = [False] * len(line)
    for end_index, (_cat, key) in a.iter(line):
        # The match occupies [end_index - len(key) + 1, end_index]; trim
        # ``padding`` characters from both ends of that span.
        start_index = end_index - len(key) + 1 + padding
        stop_index = end_index + 1 - padding
        for i in range(start_index, stop_index):
            line_indices[i] = True
    return line_indices


 def main():
    """Demonstrate keyword matching without and with space padding.

    Prints the matches for un-padded keywords, then the matches for keywords
    and lines wrapped in single spaces, and finally the padded lines with
    the matched keywords removed and replaced by dashes.
    """
    plain_keywords = [('he', 1), ('she', 1), ('hers', 1), ('her', 1)]
    plain_lines = [
        'he is here',
        'this is she',
        'this is hers ',
        'her bag is big',
    ]
    plain_automaton = make_aho_automaton(plain_keywords)

    print('------ no padding ---------')
    for sentence in plain_lines:
        print(sentence, ':', find_keywords(sentence, plain_automaton))

    print('------ with padding --------')
    # With surrounding spaces a keyword only matches when space-delimited.
    padded_keywords = [(' he ', 1), (' she ', 1), (' hers ', 1), (' her ', 1)]
    padded_lines = [
        ' he is here ',
        ' this is she ',
        ' this is hers ',
        ' her bag is big ',
    ]
    a_spaces = make_aho_automaton(padded_keywords)
    for sentence in padded_lines:
        print(sentence, ':', find_keywords(sentence, a_spaces))

    print('------ replacing/removing found keywords ---------')
    removed = []
    replaced = []
    for sentence in padded_lines:
        # One boolean per character: True where a keyword was found.
        mask = find_keyword_locations(sentence, a_spaces)
        kept_chars = [ch for ch, hit in zip(sentence, mask) if not hit]
        removed.append("".join(kept_chars))
        dashed_chars = ['-' if hit else ch for ch, hit in zip(sentence, mask)]
        replaced.append("".join(dashed_chars))

    print(padded_lines)
    print(removed)
    print(replaced)


 # Run the demo only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
	import ahocorasick # pip install pyahocorasick

	"""
	See: http://ieva.rocks/2016/11/24/keyword-matching-with-aho-corasick/
	"""


	def make_aho_automaton(keywords):
	    """Build a ready-to-search Aho-Corasick automaton from keyword pairs.

	    ``keywords`` is an iterable of ``(key, category)`` pairs. Every key is
	    added to the trie with the payload ``(category, key)``, which is the
	    value a search over the returned automaton reports for each match.
	    """
	    a = ahocorasick.Automaton()
	    for (key, cat) in keywords:
	        # Keep the keyword itself in the payload so matches carry their text.
	        a.add_word(key, (cat, key))
	    # Turn the populated trie into a searchable automaton.
	    a.make_automaton()
	    return a


	def find_keywords(line, a):
	    """Return the keywords that automaton ``a`` matches in ``line``.

	    Keywords are listed in the order ``a.iter(line)`` yields them, one
	    entry per match.
	    """
	    # Each item is (end_index, (category, keyword)); keep the keyword only.
	    return [key for _end_index, (_cat, key) in a.iter(line)]


	def find_keyword_locations(line, a, padding=1):
	    """Mark which characters of ``line`` belong to a matched keyword.

	    Args:
	        line: Text to search.
	        a: Automaton whose ``iter(line)`` yields ``(end_index, (cat, key))``
	            with ``end_index`` being the position of the match's last
	            character.
	        padding: Number of delimiter characters on each side of every
	            stored key that must stay unmarked. The default of 1 fits keys
	            stored as ``' word '``; pass 0 for keys stored without padding.

	    Returns:
	        A list of booleans, one per character of ``line``; ``True`` marks
	        a character that lies inside a keyword (padding excluded).

	    Raises:
	        ValueError: If ``padding`` is negative.
	    """
	    if padding < 0:
	        raise ValueError("padding must be non-negative")
	    line_indices = [False] * len(line)
	    for end_index, (_cat, key) in a.iter(line):
	        # The match occupies [end_index - len(key) + 1, end_index]; trim
	        # ``padding`` characters from both ends of that span.
	        start_index = end_index - len(key) + 1 + padding
	        stop_index = end_index + 1 - padding
	        for i in range(start_index, stop_index):
	            line_indices[i] = True
	    return line_indices


	def main():
	    """Demonstrate keyword matching without and with space padding.

	    Prints the matches for un-padded keywords, then the matches for
	    keywords and lines wrapped in single spaces, and finally the padded
	    lines with the matched keywords removed and replaced by dashes.
	    """
	    keywords = [
	        ('he', 1),
	        ('she', 1),
	        ('hers', 1),
	        ('her', 1),
	    ]
	    text = [
	        'he is here',
	        'this is she',
	        'this is hers ',
	        'her bag is big',
	    ]
	    a = make_aho_automaton(keywords)

	    print('------ no padding ---------')
	    for line in text:
	        print(line, ':', find_keywords(line, a))

	    print('------ with padding --------')
	    # With surrounding spaces a keyword only matches when space-delimited.
	    keywords = [
	        (' he ', 1),
	        (' she ', 1),
	        (' hers ', 1),
	        (' her ', 1),
	    ]
	    text = [
	        ' he is here ',
	        ' this is she ',
	        ' this is hers ',
	        ' her bag is big ',
	    ]
	    a_spaces = make_aho_automaton(keywords)
	    for line in text:
	        print(line, ':', find_keywords(line, a_spaces))

	    print('------ replacing/removing found keywords ---------')
	    new_text_removed = []
	    new_text_replaced = []
	    for line in text:
	        # One boolean per character: True where a keyword was found.
	        line_indices = find_keyword_locations(line, a_spaces)
	        new_line = "".join(ch for ch, x in zip(line, line_indices) if not x)
	        new_text_removed.append(new_line)
	        new_line = "".join('-' if x else ch for ch, x in zip(line, line_indices))
	        new_text_replaced.append(new_line)

	    print(text)
	    print(new_text_removed)
	    print(new_text_replaced)


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()