Skip to content

Instantly share code, notes, and snippets.

@2minchul
Created October 20, 2020 09:24
Show Gist options
  • Save 2minchul/610b7031a4839dcdc105b4d1e50fca07 to your computer and use it in GitHub Desktop.
Save 2minchul/610b7031a4839dcdc105b4d1e50fca07 to your computer and use it in GitHub Desktop.
pyahocorasick example
import ahocorasick # pip install pyahocorasick
"""
See: http://ieva.rocks/2016/11/24/keyword-matching-with-aho-corasick/
"""
def make_aho_automaton(keywords):
    """Build an Aho-Corasick automaton from ``(key, category)`` pairs.

    Args:
        keywords: Iterable of ``(key, category)`` pairs to register.

    Returns:
        The ``ahocorasick.Automaton`` after ``make_automaton()`` has been
        called on it. The value stored for each key is the tuple
        ``(category, key)``.
    """
    automaton = ahocorasick.Automaton()
    for word, category in keywords:
        # Store the key inside the payload so a match can report which
        # keyword was found, not only its category.
        automaton.add_word(word, (category, word))
    # Convert the populated trie into a searchable automaton.
    automaton.make_automaton()
    return automaton
def find_keywords(line, a):
    """Return the key of every match that ``a`` reports in ``line``.

    Args:
        line: Text to scan.
        a: Automaton whose ``iter(line)`` yields
            ``(end_index, (category, key))`` pairs.

    Returns:
        List of matched keys, in the order ``a.iter(line)`` yields them.
    """
    return [key for _end_index, (_category, key) in a.iter(line)]
def find_keyword_locations(line, a, padding=1):
    """Build a per-character mask of the keyword matches found in ``line``.

    Args:
        line: Text to scan.
        a: Automaton whose ``iter(line)`` yields
            ``(end_index, (category, key))`` pairs, where ``end_index`` is
            the index in ``line`` of the last character of the match.
        padding: Number of characters at each end of every key that are
            delimiters rather than part of the keyword itself, and so are
            left unmarked. The default of 1 suits keys wrapped in a single
            space (``' he '``); pass 0 for keys stored without padding.

    Returns:
        A list of booleans, one per character of ``line``; ``True`` marks
        a character that belongs to a matched keyword (padding excluded).

    Raises:
        ValueError: If ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError('padding must be non-negative')

    line_indices = [False] * len(line)
    for end_index, (_category, key) in a.iter(line):
        # The match covers [end_index - len(key) + 1, end_index] inclusive;
        # shrink that span by ``padding`` characters on both sides.
        start_index = end_index - len(key) + 1 + padding
        stop_index = end_index + 1 - padding
        for i in range(start_index, stop_index):
            line_indices[i] = True
    return line_indices
def main():
    """Run the demo: plain matching, padded matching, then masking matches.

    Every stage prints its results to standard output.
    """
    # Stage 1: keywords stored as-is, so they also match inside longer words.
    plain_keywords = [
        ('he', 1),
        ('she', 1),
        ('hers', 1),
        ('her', 1)
    ]
    plain_lines = [
        'he is here',
        'this is she',
        'this is hers ',
        'her bag is big'
    ]
    plain_automaton = make_aho_automaton(plain_keywords)
    print('------ no padding ---------')
    for sentence in plain_lines:
        print(sentence, ':', find_keywords(sentence, plain_automaton))

    # Stage 2: keywords and lines are wrapped in spaces, so a keyword only
    # matches where it is delimited by spaces in the line.
    print('------ with padding --------')
    padded_keywords = [
        (' he ', 1),
        (' she ', 1),
        (' hers ', 1),
        (' her ', 1)
    ]
    padded_lines = [
        ' he is here ',
        ' this is she ',
        ' this is hers ',
        ' her bag is big '
    ]
    padded_automaton = make_aho_automaton(padded_keywords)
    for sentence in padded_lines:
        print(sentence, ':', find_keywords(sentence, padded_automaton))

    # Stage 3: use the per-character mask to drop or dash out the keywords.
    print('------ replacing/removing found keywords ---------')
    removed_lines = []
    replaced_lines = []
    for sentence in padded_lines:
        mask = find_keyword_locations(sentence, padded_automaton)
        removed_lines.append(
            "".join(char for char, hit in zip(sentence, mask) if not hit)
        )
        replaced_lines.append(
            "".join('-' if hit else char for char, hit in zip(sentence, mask))
        )
    print(padded_lines)
    print(removed_lines)
    print(replaced_lines)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment