cloudRoutine · September 29, 2015 16:42
diff --git a/boyer_moore.fsx b/boyer_moore.fsx
 // Boyer Moore String Search Algorithm
 open System
 open System.Collections

 /// Returns the index of the given character in the English alphabet counting from 0
 let alphabet_index (ch:char) =
    let chlow   =   Char.ToLower( ch)
    let charnum =   Convert.ToInt32 chlow
    charnum - 97        // 'a' is ASCII character 97

 /// Returns the length of the match of the substrings of str 
 /// beginning at index1 and index2
 let match_length (str:string) (index1:int) (index2:int) =
    let mutable ind1 = index1
    let mutable ind2 = index2
    if      ind1 = ind2
    then    str.Length - ind1
    else    let mutable match_count = 0
            while ind1 < str.Length
               && ind2 < str.Length
               && str.[ind1] = str.[ind2] do
                
                match_count <- match_count + 1
                ind1        <- ind1 + 1
                ind2        <- ind2 + 1
            match_count

 /// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring 
 /// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
 /// where n is the length of S.
 let fundamental_preprocess (str:string) : int [] =
    match str.Length with
    | 0 ->  [||]        //  Handles the case of empty string
    | 1 ->  [|1|]       //  Handles the case of single character string
    | _ ->  let z = [| for x in str -> 0 |]
            z.[0] <- str.Length
            z.[1] <- match_length str 0 1
                
            for i in (2)..(z.[1]) do
                z.[i] <- z.[1] - i + 1
            
            let mutable l = 0
            let mutable r = 0
            for i in (2+z.[1])..(str.Length-1) do
                if      i <= r
                then    let k = i - l
                        let b = z.[k]
                        let a = r - i + 1

                        if      b < a
                        then    z.[i] <- b
                        else    z.[i] <- b + (match_length str a (r+1))
                                l <- i
                                r <- i+z.[i]-1
                else    z.[i] <- match_length str 0 i
                        if      z.[i] > 0
                        then    l <- i
                                r <- i+z.[i]-1
            z


 /// Generates R for S, which is an array indexed by the position of some 
 /// character c in the English alphabet. At that index in R is an array 
 /// of length |S|+1, specifying for each index i in S (plus the index after S) 
 /// the next location of character c encountered when traversing S from 
 /// right to left starting at i.
 let bad_character_table (str:string) =
    if      str.Length  = 0 
    then    [| for a in 0..25 -> [||] |] 
    else    let R       = [| for a in 0..25 ->  [|-1|] |]
            let alpha   = [| for a in 0..25 ->   -1    |]

            for i,c in ( str.ToCharArray() 
                            |> Seq.ofArray 
                            |> Seq.mapi ( fun q d  -> q,d ) ) do
                alpha.[alphabet_index c] <- i
                for j,a in ( alpha 
                            |> Seq.ofArray 
                            |> Seq.mapi ( fun q d  -> q,d ) ) do
                    R.[j] <- Array.append R.[j] [|a|]
            R

 /// Generates L for S, an array used in the implementation of the strong 
 /// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
 /// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring 
 /// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P 
 /// relative to T such that no instances of P in T are skipped and a suffix 
 /// of P[:L[ i]] matches the substring of T matched by a suffix of P in the 
 /// previous match attempt. Specifically, if the mismatch took place at position 
 /// i-1 in P, the shift magnitude is given by the equation P.length - L.[i]. 
 /// In the case that L.[i] = -1, the full shift table is used.
 /// Since only proper suffixes matter, L.[0] = -1.
 let good_suffix_table (str:string) =
    let L = [| for c in str -> -1 |]
    let N = fundamental_preprocess ( str.ToCharArray() |> Array.rev |> string )
            |> Array.rev
    for j in 0..(str.Length-2) do
        let i = str.Length - N.[j]
        if      i <> str.Length
        then    L.[i] <- j
    L


 /// Generates F for S, an array used in a special case of the good suffix rule in 
 /// the Boyer-Moore string search algorithm. F[i] is the length of the longest 
 /// suffix of S[i:] that is also a prefix of S. In the cases it is used, the 
 /// shift magnitude of the pattern P relative to the text T is len(P) - F[i] 
 /// for a mismatch occurring at i-1
 let full_shift_table (str:string) =
    let F = [| for x in str -> 0 |]
    let Z = fundamental_preprocess str
    let mutable longest = 0
    for i,zv in  (  Z   |>  Array.rev
                        |>  Seq.ofArray 
                        |>  Seq.mapi ( fun q d  -> q,d ) ) do
        longest <-  if      zv = i + 1
                    then    max zv longest
                    else    longest
        F.[str.Length - i - 1 ] <- longest
    F

 /// Implementation of the Boyer-Moore string search algorithm. This finds all 
 /// occurrences of P in T, and incorporates numerous ways of pre-processing 
 /// the pattern to determine the optimal amount to shift the string and skip 
 /// comparisons. In practice it runs in O(m) (and even sublinear) time, where 
 /// m is the length of T. This implementation performs a case-insensitive
 /// search on ASCII alphabetic characters, spaces not included.
 let boyer_moore_search (P:string) (T:string) =
    if  P.Length = 0 || T.Length = 0 || T.Length < P.Length 
    then []
    else    
        let mutable matches = []
        let R       = bad_character_table   P
        let L       = good_suffix_table     P 
        let F       = full_shift_table      P
        
        let mutable k = P.Length - 1    // Represents alignment of end of P relative to T
        let mutable previous_k = - 1    // Represents alignment in previous phase

        while k < T.Length do
            let mutable i = P.Length - 1    // Character to compare in P
            let mutable h = k               // Character to compare in T
            // Matches starting from end of P
            while i >= 0 && h > previous_k && P.[i] = T.[h] do
                    i <- i - 1
                    h <- h - 1
            if      i = -1 || h = previous_k    // Match has been found
            then    matches <- ( k - P.Length + 1 ) :: matches
                    k   <-  if      P.Length > 1 
                            then    k +  P.Length - F.[1]
                            else    1
            // No match, shift by max of bad character and good suffix rules
            else    let char_shift      = i - R.[(alphabet_index T.[h])].[i]
                    let suffix_shift    = 
                        // Mismatch happened on first attempt
                        if      i + 1 = P.Length
                        then    1
                        // Matched suffix does not appear anywhere in P
                        elif    L.[i+1] = -1
                        then    P.Length - F.[i+1]
                        // Matched Suffix appears in P
                        else    P.Length - L.[i+1]
                    let shift = max char_shift  suffix_shift
                    // Galil's rule
                    previous_k <-   if      shift >= i + 1
                                    then    k
                                    else    previous_k
                    k <- k + shift
        matches
	// Boyer Moore String Search Algorithm
	open System
	open System.Collections

	/// Returns the index of the given character in the English alphabet counting from 0
	let alphabet_index (ch:char) =
	let chlow = Char.ToLower( ch)
	let charnum = Convert.ToInt32 chlow
	charnum - 97 // 'a' is ASCII character 97

	/// Returns the length of the match of the substrings of str
	/// beginning at index1 and index2
	let match_length (str:string) (index1:int) (index2:int) =
	let mutable ind1 = index1
	let mutable ind2 = index2
	if ind1 = ind2
	then str.Length - ind1
	else let mutable match_count = 0
	while ind1 < str.Length
	&& ind2 < str.Length
	&& str.[ind1] = str.[ind2] do

	match_count <- match_count + 1
	ind1 <- ind1 + 1
	ind2 <- ind2 + 1
	match_count

	/// Returns Z, the Fundamental Preprocessing of S. Z[i] is the length of the substring
	/// beginning at i which is also a prefix of S. This pre-processing is done in O(n) time,
	/// where n is the length of S.
	let fundamental_preprocess (str:string) : int [] =
	match str.Length with
	\| 0 -> [\|\|] // Handles the case of empty string
	\| 1 -> [\|1\|] // Handles the case of single character string
	\| _ -> let z = [\| for x in str -> 0 \|]
	z.[0] <- str.Length
	z.[1] <- match_length str 0 1

	for i in (2)..(z.[1]) do
	z.[i] <- z.[1] - i + 1

	let mutable l = 0
	let mutable r = 0
	for i in (2+z.[1])..(str.Length-1) do
	if i <= r
	then let k = i - l
	let b = z.[k]
	let a = r - i + 1

	if b < a
	then z.[i] <- b
	else z.[i] <- b + (match_length str a (r+1))
	l <- i
	r <- i+z.[i]-1
	else z.[i] <- match_length str 0 i
	if z.[i] > 0
	then l <- i
	r <- i+z.[i]-1
	z


	/// Generates R for S, which is an array indexed by the position of some
	/// character c in the English alphabet. At that index in R is an array
	/// of length \|S\|+1, specifying for each index i in S (plus the index after S)
	/// the next location of character c encountered when traversing S from
	/// right to left starting at i.
	let bad_character_table (str:string) =
	if str.Length = 0
	then [\| for a in 0..25 -> [\|\|] \|]
	else let R = [\| for a in 0..25 -> [\|-1\|] \|]
	let alpha = [\| for a in 0..25 -> -1 \|]

	for i,c in ( str.ToCharArray()
	\|> Seq.ofArray
	\|> Seq.mapi ( fun q d -> q,d ) ) do
	alpha.[alphabet_index c] <- i
	for j,a in ( alpha
	\|> Seq.ofArray
	\|> Seq.mapi ( fun q d -> q,d ) ) do
	R.[j] <- Array.append R.[j] [\|a\|]
	R

	/// Generates L for S, an array used in the implementation of the strong
	/// good suffix rule. L.[i] = k, the largest position in str such that str.substring(i)
	/// ( the suffix of S starting at i ) matches a suffix of str.substring(0,k) ( a substring
	/// in str ending at k ). Used in Boyer-Moore, L gives an amount to shift P
	/// relative to T such that no instances of P in T are skipped and a suffix
	/// of P[:L[ i]] matches the substring of T matched by a suffix of P in the
	/// previous match attempt. Specifically, if the mismatch took place at position
	/// i-1 in P, the shift magnitude is given by the equation P.length - L.[i].
	/// In the case that L.[i] = -1, the full shift table is used.
	/// Since only proper suffixes matter, L.[0] = -1.
	let good_suffix_table (str:string) =
	let L = [\| for c in str -> -1 \|]
	let N = fundamental_preprocess ( str.ToCharArray() \|> Array.rev \|> string )
	\|> Array.rev
	for j in 0..(str.Length-2) do
	let i = str.Length - N.[j]
	if i <> str.Length
	then L.[i] <- j
	L


	/// Generates F for S, an array used in a special case of the good suffix rule in
	/// the Boyer-Moore string search algorithm. F[i] is the length of the longest
	/// suffix of S[i:] that is also a prefix of S. In the cases it is used, the
	/// shift magnitude of the pattern P relative to the text T is len(P) - F[i]
	/// for a mismatch occurring at i-1
	let full_shift_table (str:string) =
	let F = [\| for x in str -> 0 \|]
	let Z = fundamental_preprocess str
	let mutable longest = 0
	for i,zv in ( Z \|> Array.rev
	\|> Seq.ofArray
	\|> Seq.mapi ( fun q d -> q,d ) ) do
	longest <- if zv = i + 1
	then max zv longest
	else longest
	F.[str.Length - i - 1 ] <- longest
	F

	/// Implementation of the Boyer-Moore string search algorithm. This finds all
	/// occurrences of P in T, and incorporates numerous ways of pre-processing
	/// the pattern to determine the optimal amount to shift the string and skip
	/// comparisons. In practice it runs in O(m) (and even sublinear) time, where
	/// m is the length of T. This implementation performs a case-insensitive
	/// search on ASCII alphabetic characters, spaces not included.
	let boyer_moore_search (P:string) (T:string) =
	if P.Length = 0 \|\| T.Length = 0 \|\| T.Length < P.Length
	then []
	else
	let mutable matches = []
	let R = bad_character_table P
	let L = good_suffix_table P
	let F = full_shift_table P

	let mutable k = P.Length - 1 // Represents alignment of end of P relative to T
	let mutable previous_k = - 1 // Represents alignment in previous phase

	while k < T.Length do
	let mutable i = P.Length - 1 // Character to compare in P
	let mutable h = k // Character to compare in T
	// Matches starting from end of P
	while i >= 0 && h > previous_k && P.[i] = T.[h] do
	i <- i - 1
	h <- h - 1
	if i = -1 \|\| h = previous_k // Match has been found
	then matches <- ( k - P.Length + 1 ) :: matches
	k <- if P.Length > 1
	then k + P.Length - F.[1]
	else 1
	// No match, shift by max of bad character and good suffix rules
	else let char_shift = i - R.[(alphabet_index T.[h])].[i]
	let suffix_shift =
	// Mismatch happened on first attempt
	if i + 1 = P.Length
	then 1
	// Matched suffix does not appear anywhere in P
	elif L.[i+1] = -1
	then P.Length - F.[i+1]
	// Matched Suffix appears in P
	else P.Length - L.[i+1]
	let shift = max char_shift suffix_shift
	// Galil's rule
	previous_k <- if shift >= i + 1
	then k
	else previous_k
	k <- k + shift
	matches
No results found