Created
November 1, 2011 03:32
-
-
Save mavc/1329810 to your computer and use it in GitHub Desktop.
Boyer-Moore string search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Boyer-Moore exact string search, implemented from the description | |
given in: | |
http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm | |
''' | |
def _bad_character_shift(pat): | |
''' Calculates the bad character shift as the distance of the | |
rightmost occurance of a character to the end (without counting | |
the last character of the string). | |
>>> b = _bad_character_shift('ampanman') | |
>>> sorted(b.items()) | |
[('a', 1), ('m', 2), ('n', 3), ('p', 5)] | |
''' | |
shift = {} | |
# for every character in the string, record its rightmost occurence | |
# as distance from the last character. | |
m = len(pat) - 1 | |
for i, ch in enumerate(pat[:-1]): | |
shift[ch] = m - i | |
return shift | |
def _good_suffix_shift(pat): | |
''' Source: http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/bmen.htm | |
>>> c = _good_suffix_shift('anpanman') | |
>>> c | |
[6, 6, 6, 6, 6, 6, 3, 8, 1] | |
''' | |
# Starting from the right, compute the position of the widest border | |
# of the suffix starting at i. | |
# | |
# e.g. 0 1 2 3 4 5 6 7 | |
# a b b a b a b | |
# | |
# We get f[2] == 4, for the border "bab" for babab. | |
# Position 7 would indicate the empty string. | |
# | |
# We will calculate the value of shifts in the first loop, by the | |
# borders that cannot be extended to the left. | |
# | |
# Position 2 has border 'bab' beginning at Position 4. Since | |
# this border cannot be extended ('bbab' != 'abab'), we store | |
# the shift from Position 4 to Position 2 (s[4] = 4-2). | |
m = len(pat) | |
f = [0] * (m + 1) # borders | |
s = [0] * (m + 1) # shifts | |
f[m] = j = m + 1 | |
for i in range(m, 0, -1): | |
# Starting from the last element, check if we can extend the border for this | |
# character (which starts at j). | |
while (j <= m and pat[i-1] != pat[j-1]): | |
# If we can't, extend the border, then store the shift in s[j]. | |
# (We'll need to shift the pattern this number to match suffixes.) | |
s[j] = s[j] or j - i | |
j = f[j] # For the next iteration, check the earliest suffix minus one | |
# character for a possible suffix match. | |
# Now, j-1 is the earliest known border for the previous character. | |
j -= 1 | |
f[i-1] = j | |
# "Case 2" | |
# In the second loop, we consider that for each pattern, a suffix can occur at the | |
# beginning of the string. We need to be able to store shifts for these cases as well. | |
# Note: The case when a pattern is not a substring of the str to be processed is a special | |
# case of "Case 2". | |
# For each index for which s[i] is empty: | |
# Upon a mismatch at i, we know that there is not shift that will align a candidate pattern | |
# within the string. Therefore, we shift until the head of the string matches the matching | |
# suffix (after the mismatch position). | |
# For i=0...f[0], the shift value is given by f[0], since that is the position of the widest | |
# border of the whole string. Therefore, shifting that value will match the head of the string | |
# with the suffix. | |
j = f[0] | |
for i in range(m + 1): | |
s[i] = s[i] or j | |
# However, when i < j, we have to take the next shortest border (since we will not have | |
# already matched the pattern pat[j:] | |
if (i == j): j = f[i] | |
# Now we're done! Now, s[i] represents how the good suffix shift for a match of pat[i:] that | |
# differs at position i - 1. (and s[0] represents the good suffix shift for a complete match). | |
return s | |
def index(s, pat, start=0): | |
''' Returns the index of the first occurence of pat within s | |
>>> index('hello', 'goodbye') | |
-1 | |
>>> index('hello', 'hello') | |
0 | |
>>> index('abracadabra', 'abra') | |
0 | |
>>> index('abracadabra', 'abra', start=1) | |
7 | |
>>> index('abracadabra', 'abra', start=7) | |
7 | |
>>> index('abracadabra', 'abra', start=8) | |
-1 | |
''' | |
if len(s) < len(pat) + start: return -1 | |
m = len(pat) | |
n = len(s) | |
bad = _bad_character_shift(pat) | |
suf = _good_suffix_shift(pat) | |
i = start | |
while (i + m <= n): | |
for j, ch in enumerate(reversed(s[i:i+m])): | |
if (ch != pat[-1-j]): | |
# Upon a mismatch, advance forward the max of the | |
# good suffix and bad character shifts. | |
bshift = (bad[ch] if ch in bad else m) - j # can be negative. | |
sshift = suf[m-j] | |
i += max(bshift, sshift) | |
break | |
else: | |
return i | |
# (Not found). | |
return -1 | |
def indexall(s, pat, start=0): | |
''' Returns the indices of all occurences of pat within s | |
>>> indexall('hello', 'goodbye') | |
[] | |
>>> indexall('hello', 'hello') | |
[0] | |
>>> indexall('hello', 'l') | |
[2, 3] | |
>>> indexall('abracadabra', 'abra') | |
[0, 7] | |
''' | |
if len(s) < len(pat) + start: return [] | |
m = len(pat) | |
n = len(s) | |
bad = _bad_character_shift(pat) | |
suf = _good_suffix_shift(pat) | |
indices = [] | |
i = start | |
while (i + m <= n): | |
for j, ch in enumerate(reversed(s[i:i+m])): | |
if (ch != pat[-1-j]): | |
# Upon a mismatch, advance forward the max of the | |
# good suffix and bad character shifts. | |
bshift = bad[ch] if ch in bad else m | |
sshift = suf[m-j] | |
i += max(bshift, sshift) | |
break | |
else: | |
indices.append(i) | |
i += suf[0] | |
return indices | |
if __name__ == '__main__': | |
import doctest | |
doctest.testmod() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment