Created
September 29, 2015 16:42
-
-
Save cloudRoutine/ef5f6794692260906ceb to your computer and use it in GitHub Desktop.
Boyer Moore Search Algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Boyer Moore String Search Algorithm | |
| open System | |
| open System.Collections | |
| /// Returns the index of the given character in the English alphabet counting from 0 | |
| let alphabet_index (ch:char) = | |
| let chlow = Char.ToLower( ch) | |
| let charnum = Convert.ToInt32 chlow | |
| charnum - 97 // 'a' is ASCII character 97 | |
| /// Returns the length of the match of the substrings of str | |
| /// beginning at index1 and index2 | |
| let match_length (str:string) (index1:int) (index2:int) = | |
| let mutable ind1 = index1 | |
| let mutable ind2 = index2 | |
| if ind1 = ind2 | |
| then str.Length - ind1 | |
| else let mutable match_count = 0 | |
| while ind1 < str.Length | |
| && ind2 < str.Length | |
| && str.[ind1] = str.[ind2] do | |
| match_count <- match_count + 1 | |
| ind1 <- ind1 + 1 | |
| ind2 <- ind2 + 1 | |
| match_count | |
| /// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring | |
| /// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time, | |
| /// where n is the length of S. | |
| let fundamental_preprocess (str:string) : int [] = | |
| match str.Length with | |
| | 0 -> [||] // Handles the case of empty string | |
| | 1 -> [|1|] // Handles the case of single character string | |
| | _ -> let z = [| for x in str -> 0 |] | |
| z.[0] <- str.Length | |
| z.[1] <- match_length str 0 1 | |
| for i in (2)..(z.[1]) do | |
| z.[i] <- z.[1] - i + 1 | |
| let mutable l = 0 | |
| let mutable r = 0 | |
| for i in (2+z.[1])..(str.Length-1) do | |
| if i <= r | |
| then let k = i - l | |
| let b = z.[k] | |
| let a = r - i + 1 | |
| if b < a | |
| then z.[i] <- b | |
| else z.[i] <- b + (match_length str a (r+1)) | |
| l <- i | |
| r <- i+z.[i]-1 | |
| else z.[i] <- match_length str 0 i | |
| if z.[i] > 0 | |
| then l <- i | |
| r <- i+z.[i]-1 | |
| z | |
| /// Generates R for S, which is an array indexed by the position of some | |
| /// character c in the English alphabet. At that index in R is an array | |
| /// of length |S|+1, specifying for each index i in S (plus the index after S) | |
| /// the next location of character c encountered when traversing S from | |
| /// right to left starting at i. | |
| let bad_character_table (str:string) = | |
| if str.Length = 0 | |
| then [| for a in 0..25 -> [||] |] | |
| else let R = [| for a in 0..25 -> [|-1|] |] | |
| let alpha = [| for a in 0..25 -> -1 |] | |
| for i,c in ( str.ToCharArray() | |
| |> Seq.ofArray | |
| |> Seq.mapi ( fun q d -> q,d ) ) do | |
| alpha.[alphabet_index c] <- i | |
| for j,a in ( alpha | |
| |> Seq.ofArray | |
| |> Seq.mapi ( fun q d -> q,d ) ) do | |
| R.[j] <- Array.append R.[j] [|a|] | |
| R | |
| /// Generates L for S, an array used in the implementation of the strong | |
| /// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i) | |
| /// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring | |
| /// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P | |
| /// relative to T such that no instances of P in T are skipped and a suffix | |
| /// of P[:L[ i]] matches the substring of T matched by a suffix of P in the | |
| /// previous match attempt. Specifically, if the mismatch took place at position | |
| /// i-1 in P, the shift magnitude is given by the equation P.length - L.[i]. | |
| /// In the case that L.[i] = -1, the full shift table is used. | |
| /// Since only proper suffixes matter, L.[0] = -1. | |
| let good_suffix_table (str:string) = | |
| let L = [| for c in str -> -1 |] | |
| let N = fundamental_preprocess ( str.ToCharArray() |> Array.rev |> string ) | |
| |> Array.rev | |
| for j in 0..(str.Length-2) do | |
| let i = str.Length - N.[j] | |
| if i <> str.Length | |
| then L.[i] <- j | |
| L | |
| /// Generates F for S, an array used in a special case of the good suffix rule in | |
| /// the Boyer-Moore string search algorithm. F[i] is the length of the longest | |
| /// suffix of S[i:] that is also a prefix of S. In the cases it is used, the | |
| /// shift magnitude of the pattern P relative to the text T is len(P) - F[i] | |
| /// for a mismatch occurring at i-1 | |
| let full_shift_table (str:string) = | |
| let F = [| for x in str -> 0 |] | |
| let Z = fundamental_preprocess str | |
| let mutable longest = 0 | |
| for i,zv in ( Z |> Array.rev | |
| |> Seq.ofArray | |
| |> Seq.mapi ( fun q d -> q,d ) ) do | |
| longest <- if zv = i + 1 | |
| then max zv longest | |
| else longest | |
| F.[str.Length - i - 1 ] <- longest | |
| F | |
| /// Implementation of the Boyer-Moore string search algorithm. This finds all | |
| /// occurrences of P in T, and incorporates numerous ways of pre-processing | |
| /// the pattern to determine the optimal amount to shift the string and skip | |
| /// comparisons. In practice it runs in O(m) (and even sublinear) time, where | |
| /// m is the length of T. This implementation performs a case-insensitive | |
| /// search on ASCII alphabetic characters, spaces not included. | |
| let boyer_moore_search (P:string) (T:string) = | |
| if P.Length = 0 || T.Length = 0 || T.Length < P.Length | |
| then [] | |
| else | |
| let mutable matches = [] | |
| let R = bad_character_table P | |
| let L = good_suffix_table P | |
| let F = full_shift_table P | |
| let mutable k = P.Length - 1 // Represents alignment of end of P relative to T | |
| let mutable previous_k = - 1 // Represents alignment in previous phase | |
| while k < T.Length do | |
| let mutable i = P.Length - 1 // Character to compare in P | |
| let mutable h = k // Character to compare in T | |
| // Matches starting from end of P | |
| while i >= 0 && h > previous_k && P.[i] = T.[h] do | |
| i <- i - 1 | |
| h <- h - 1 | |
| if i = -1 || h = previous_k // Match has been found | |
| then matches <- ( k - P.Length + 1 ) :: matches | |
| k <- if P.Length > 1 | |
| then k + P.Length - F.[1] | |
| else 1 | |
| // No match, shift by max of bad character and good suffix rules | |
| else let char_shift = i - R.[(alphabet_index T.[h])].[i] | |
| let suffix_shift = | |
| // Mismatch happened on first attempt | |
| if i + 1 = P.Length | |
| then 1 | |
| // Matched suffix does not appear anywhere in P | |
| elif L.[i+1] = -1 | |
| then P.Length - F.[i+1] | |
| // Matched Suffix appears in P | |
| else P.Length - L.[i+1] | |
| let shift = max char_shift suffix_shift | |
| // Galil's rule | |
| previous_k <- if shift >= i + 1 | |
| then k | |
| else previous_k | |
| k <- k + shift | |
| matches |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment