Skip to content

Instantly share code, notes, and snippets.

@cloudRoutine
Created September 29, 2015 16:42
Show Gist options
  • Select an option

  • Save cloudRoutine/ef5f6794692260906ceb to your computer and use it in GitHub Desktop.

Select an option

Save cloudRoutine/ef5f6794692260906ceb to your computer and use it in GitHub Desktop.
Boyer Moore Search Algorithm
// Boyer Moore String Search Algorithm
open System
open System.Collections
/// Returns the index of the given character in the English alphabet counting from 0
let alphabet_index (ch:char) =
let chlow = Char.ToLower( ch)
let charnum = Convert.ToInt32 chlow
charnum - 97 // 'a' is ASCII character 97
/// Returns the length of the match of the substrings of str
/// beginning at index1 and index2
let match_length (str:string) (index1:int) (index2:int) =
let mutable ind1 = index1
let mutable ind2 = index2
if ind1 = ind2
then str.Length - ind1
else let mutable match_count = 0
while ind1 < str.Length
&& ind2 < str.Length
&& str.[ind1] = str.[ind2] do
match_count <- match_count + 1
ind1 <- ind1 + 1
ind2 <- ind2 + 1
match_count
/// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring
/// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
/// where n is the length of S.
let fundamental_preprocess (str:string) : int [] =
match str.Length with
| 0 -> [||] // Handles the case of empty string
| 1 -> [|1|] // Handles the case of single character string
| _ -> let z = [| for x in str -> 0 |]
z.[0] <- str.Length
z.[1] <- match_length str 0 1
for i in (2)..(z.[1]) do
z.[i] <- z.[1] - i + 1
let mutable l = 0
let mutable r = 0
for i in (2+z.[1])..(str.Length-1) do
if i <= r
then let k = i - l
let b = z.[k]
let a = r - i + 1
if b < a
then z.[i] <- b
else z.[i] <- b + (match_length str a (r+1))
l <- i
r <- i+z.[i]-1
else z.[i] <- match_length str 0 i
if z.[i] > 0
then l <- i
r <- i+z.[i]-1
z
/// Generates R for S, which is an array indexed by the position of some
/// character c in the English alphabet. At that index in R is an array
/// of length |S|+1, specifying for each index i in S (plus the index after S)
/// the next location of character c encountered when traversing S from
/// right to left starting at i.
let bad_character_table (str:string) =
if str.Length = 0
then [| for a in 0..25 -> [||] |]
else let R = [| for a in 0..25 -> [|-1|] |]
let alpha = [| for a in 0..25 -> -1 |]
for i,c in ( str.ToCharArray()
|> Seq.ofArray
|> Seq.mapi ( fun q d -> q,d ) ) do
alpha.[alphabet_index c] <- i
for j,a in ( alpha
|> Seq.ofArray
|> Seq.mapi ( fun q d -> q,d ) ) do
R.[j] <- Array.append R.[j] [|a|]
R
/// Generates L for S, an array used in the implementation of the strong
/// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
/// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring
/// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P
/// relative to T such that no instances of P in T are skipped and a suffix
/// of P[:L[ i]] matches the substring of T matched by a suffix of P in the
/// previous match attempt. Specifically, if the mismatch took place at position
/// i-1 in P, the shift magnitude is given by the equation P.length - L.[i].
/// In the case that L.[i] = -1, the full shift table is used.
/// Since only proper suffixes matter, L.[0] = -1.
let good_suffix_table (str:string) =
let L = [| for c in str -> -1 |]
let N = fundamental_preprocess ( str.ToCharArray() |> Array.rev |> string )
|> Array.rev
for j in 0..(str.Length-2) do
let i = str.Length - N.[j]
if i <> str.Length
then L.[i] <- j
L
/// Generates F for S, an array used in a special case of the good suffix rule in
/// the Boyer-Moore string search algorithm. F[i] is the length of the longest
/// suffix of S[i:] that is also a prefix of S. In the cases it is used, the
/// shift magnitude of the pattern P relative to the text T is len(P) - F[i]
/// for a mismatch occurring at i-1
let full_shift_table (str:string) =
let F = [| for x in str -> 0 |]
let Z = fundamental_preprocess str
let mutable longest = 0
for i,zv in ( Z |> Array.rev
|> Seq.ofArray
|> Seq.mapi ( fun q d -> q,d ) ) do
longest <- if zv = i + 1
then max zv longest
else longest
F.[str.Length - i - 1 ] <- longest
F
/// Implementation of the Boyer-Moore string search algorithm. This finds all
/// occurrences of P in T, and incorporates numerous ways of pre-processing
/// the pattern to determine the optimal amount to shift the string and skip
/// comparisons. In practice it runs in O(m) (and even sublinear) time, where
/// m is the length of T. This implementation performs a case-insensitive
/// search on ASCII alphabetic characters, spaces not included.
let boyer_moore_search (P:string) (T:string) =
if P.Length = 0 || T.Length = 0 || T.Length < P.Length
then []
else
let mutable matches = []
let R = bad_character_table P
let L = good_suffix_table P
let F = full_shift_table P
let mutable k = P.Length - 1 // Represents alignment of end of P relative to T
let mutable previous_k = - 1 // Represents alignment in previous phase
while k < T.Length do
let mutable i = P.Length - 1 // Character to compare in P
let mutable h = k // Character to compare in T
// Matches starting from end of P
while i >= 0 && h > previous_k && P.[i] = T.[h] do
i <- i - 1
h <- h - 1
if i = -1 || h = previous_k // Match has been found
then matches <- ( k - P.Length + 1 ) :: matches
k <- if P.Length > 1
then k + P.Length - F.[1]
else 1
// No match, shift by max of bad character and good suffix rules
else let char_shift = i - R.[(alphabet_index T.[h])].[i]
let suffix_shift =
// Mismatch happened on first attempt
if i + 1 = P.Length
then 1
// Matched suffix does not appear anywhere in P
elif L.[i+1] = -1
then P.Length - F.[i+1]
// Matched Suffix appears in P
else P.Length - L.[i+1]
let shift = max char_shift suffix_shift
// Galil's rule
previous_k <- if shift >= i + 1
then k
else previous_k
k <- k + shift
matches
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment