Last active
November 26, 2018 02:19
-
-
Save RaphaelAslanian/5c9e5b38d4e1fb2280a53e0b5ffe4c3b to your computer and use it in GitHub Desktop.
Implementation of the Knuth-Morris-Pratt algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This module contains an implementation of the Knuth-Morris-Pratt algorithm allowing to look for sequence matching. | |
For example if you want to find the sequence [1, 2, 3] in [2, 3, 1, 2, 3, 5, 6], you could do: | |
find_sequence([2, 3, 1, 2, 3, 5, 6], [1, 2, 3]) | |
This algorithm is especially interesting in case of many partial matches. Otherwise, you could simply use a brute | |
force search with correct performances. | |
The main function to use here is find_sequence which actually performs the KMP algorithm. | |
You will find documentation on this algorithm here: | |
https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm | |
""" | |
import logging | |
def find_sequence(input_list, seq_to_find): | |
""" | |
This function computes whether or not a sequence is contained in another using the Knuth–Morris–Pratt algorithm. | |
:param input_list: iterable into which to look for a certain sequence | |
:param seq_to_find: iterable holding the sequence to seek | |
:return: True if the sequence was found, False otherwise. | |
""" | |
try: | |
kmp_table = build_kmp_table(seq_to_find) | |
return apply_kmp_algo(input_list, seq_to_find, kmp_table) | |
except IndexError: | |
logging.error("The algorithm encountered an index error. " | |
"Please make sure the iterables have at least one element.") | |
except TypeError: | |
logging.error("The algorithm encountered a type error. Please make sure the arguments passed are iterable.") | |
def build_kmp_table(sequence): | |
""" | |
Builds the KMP table for a specific sequence. | |
This table will then be used in the KMP algorithm (see apply_kmp_algo). | |
:param sequence: iterable for which to construct the KMP table. | |
:return: the KMP table associated with the input sequence. | |
""" | |
kmp_table = [-1] * len(sequence) | |
cnd = 0 | |
for pos in range(1, len(sequence)): | |
if sequence[pos] == sequence[cnd]: | |
kmp_table[pos] = kmp_table[cnd] | |
else: | |
kmp_table[pos] = cnd | |
cnd = kmp_table[cnd] | |
while cnd >= 0 and sequence[pos] != sequence[cnd]: | |
cnd = kmp_table[cnd] | |
cnd += 1 | |
return kmp_table | |
def apply_kmp_algo(input_list, seq_to_find, kmp_table): | |
""" | |
Computes whether a sequence is contained into another, using the KMP table previously computed. | |
:param input_list: the sequence on which to look for. | |
:param seq_to_find: the sought sequence | |
:param kmp_table: the kmp table associated with the sequence to find. | |
:return: True whether the sequence to find appears indeed in the other reference sequence, False otherwise. | |
""" | |
idx_begin = 0 | |
idx_current = 0 | |
while idx_begin + idx_current < len(input_list): | |
if seq_to_find[idx_current] == input_list[idx_begin + idx_current]: | |
idx_current += 1 | |
if idx_current == len(seq_to_find): | |
return True | |
else: | |
if kmp_table[idx_current] > -1: | |
idx_begin += idx_current - kmp_table[idx_current] | |
idx_current = kmp_table[idx_current] | |
else: | |
idx_begin += idx_current + 1 | |
idx_current = 0 | |
return False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment