Created
January 16, 2016 09:19
-
-
Save ichiban/79bb08216fa2ba89904c to your computer and use it in GitHub Desktop.
Earley Recognizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Batteries | |
(**** symbols ****) | |
type symbol = | |
| Terminal of UChar.t | |
| Nonterminal of Text.t | |
let symbol_to_text = function | |
| Terminal char -> Text.of_uchar char | |
| Nonterminal label -> | |
let l, r = Text.of_string "<", Text.of_string ">" in | |
Text.join Text.empty [l; label; r] | |
let is_nonterminal = function | |
| Terminal _ -> false | |
| Nonterminal _ -> true | |
let char = function | |
| Terminal char -> char | |
| Nonterminal _ -> undefined () | |
(**** rules ****) | |
type 'a rule = | |
{ | |
label : Text.t; | |
symbols : symbol list; | |
action : 'a Enum.t -> 'a | |
} | |
let rule_to_text ?dot_at rule = | |
let dot = Text.of_string "●" in | |
let is_dot i = match dot_at with None -> false | Some j -> i = j in | |
let maybe_dot i = if is_dot i then [dot] else [] in | |
let rec symbols i = function | |
| [] -> maybe_dot i | |
| hd :: tl -> maybe_dot i @ symbol_to_text hd :: symbols (i + 1) tl | |
in | |
let space = Text.of_string " " in | |
let l, r = Text.of_string "<", Text.of_string ">" in | |
let label rule = Text.join Text.empty [l; rule.label; r] in | |
let arrow = Text.of_string "->" in | |
let symbols_and_dot = Text.join space (symbols 0 rule.symbols) in | |
Text.join space [label rule; arrow; symbols_and_dot] | |
type 'a t = 'a rule list | |
type 'a state = | |
{ | |
rule : 'a rule; | |
current : int; | |
origin : int; | |
data : 'a list; | |
} | |
let state_to_text state = | |
let rule = rule_to_text state.rule ~dot_at:state.current in | |
let separator = Text.of_string ", " in | |
let origin = state.origin |> String.of_int |> Text.of_string in | |
Text.join separator [rule; origin] | |
let is_complete state = | |
List.length state.rule.symbols = state.current | |
let is_completed_by state = function | |
| Terminal _ -> false | |
| Nonterminal label -> state.rule.label = label | |
let next_symbol state = | |
let symbols = state.rule.symbols in | |
if List.length symbols > state.current then | |
Some (List.at state.rule.symbols state.current) | |
else | |
None | |
let grammar_rules_for grammar = function | |
| Terminal _ -> Enum.empty () | |
| Nonterminal label -> | |
let same_label rule = (label = rule.label) in | |
List.enum grammar |> filter same_label | |
(* chart *) | |
let initial_states start grammar = | |
let start_label rule = start = rule.label in | |
let rule_to_state rule = | |
{ | |
rule = rule; | |
current = 0; | |
origin = 0; | |
data = []; | |
} | |
in | |
List.enum grammar | |
|> filter start_label | |
|> map rule_to_state | |
|> Set.of_enum | |
let initial_chart start length grammar = | |
let chart = Array.make (length + 1) (Set.empty, Set.empty) in | |
let unprocessed = initial_states start grammar in | |
let processed = Set.empty in | |
Array.set chart 0 (unprocessed, processed); | |
chart | |
let add_all_to_set chart index states = | |
let unprocessed, processed = Array.get chart index in | |
let unprocessed = Set.diff (Set.union states unprocessed) processed in | |
Array.set chart index (unprocessed, processed) | |
let add_to_set chart index state = | |
add_all_to_set chart index (Set.singleton state) | |
let pop_state chart index = | |
let unprocessed, processed = Array.get chart index in | |
if Set.is_empty unprocessed then | |
None | |
else | |
begin | |
let state, rest = Set.pop unprocessed in | |
let added = Set.add state processed in | |
Array.set chart index (rest, added); | |
Some state | |
end | |
let nullable_symbols grammar = | |
let rec search known = | |
let is_nullable rule = | |
match rule.symbols with | |
| [] -> true | |
| [x] -> Set.mem x known | |
| _ -> false | |
in | |
let symbol rule = Nonterminal rule.label in | |
let found = | |
List.enum grammar |> filter is_nullable |> map symbol |> Set.of_enum | |
in | |
if Set.equal found known then | |
known | |
else | |
search (Set.union known found) | |
in | |
search Set.empty | |
(* the actual algorithm *) | |
let predict chart state j grammar nullables = | |
let next_symbol = next_symbol state |> Option.get in | |
let rules = grammar_rules_for grammar next_symbol in | |
let rule_to_state rule = | |
{ | |
rule = rule; | |
current = 0; | |
origin = j; | |
data = []; | |
} in | |
let new_states = rules |> map rule_to_state |> Set.of_enum in | |
let is_nullable symbol = Set.mem symbol nullables in | |
let magical_completion = {state with current = state.current + 1 } in | |
add_all_to_set chart j new_states; | |
if is_nullable next_symbol then | |
add_to_set chart j magical_completion | |
let scan input chart state j = | |
let char = next_symbol state |> Option.get |> char in | |
let new_state = { | |
state with | |
current = state.current + 1; | |
} in | |
if j < Text.length input && Text.get input j = char then | |
add_to_set chart (j + 1) new_state | |
let complete chart state k = | |
let next_state state = { state with current = state.current + 1 } in | |
let _, processed = Array.get chart state.origin in | |
processed | |
|> Set.filter (next_symbol %> Option.is_some) | |
|> Set.filter (next_symbol %> Option.get %> is_completed_by state) | |
|> Set.map next_state | |
|> add_all_to_set chart k | |
let empty = [] | |
let rule label symbols action grammar = | |
{label = label; symbols = symbols; action = action} :: grammar | |
let parse input start grammar = | |
let length = Text.length input in | |
let chart = initial_chart start length grammar in | |
let nullables = nullable_symbols grammar in | |
let rec process_states i = | |
let process state = | |
if is_complete state then | |
complete chart state i | |
else if next_symbol state |> Option.get |> is_nonterminal then | |
predict chart state i grammar nullables | |
else | |
scan input chart state i; | |
process_states i | |
in | |
Option.may process (pop_state chart i) | |
in | |
foreach (0 -- length) process_states; | |
let print_state_set i = | |
Printf.printf "S(%d): %s\n" i (Text.left input i |> Text.to_string); | |
let print_state state = | |
Printf.printf "%s\n%!" (state_to_text state |> Text.to_string) | |
in | |
foreach (match Array.get chart i with _, p -> Set.enum p) print_state | |
in | |
foreach (0 -- length) print_state_set |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Batteries | |
type 'a t | |
type symbol = | |
| Terminal of UChar.t | |
| Nonterminal of Text.t | |
val empty : 'a t | |
val rule : Text.t -> symbol list -> ('a list -> 'a) -> 'a t -> 'a t | |
val parse : Text.t -> Text.t -> 'a t -> unit |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Batteries | |
let () = | |
let term char = Grammar.Terminal (UChar.of_char char) in | |
let nonterm label = Grammar.Nonterminal (Text.of_string label) in | |
let rule label symbols = Grammar.rule (Text.of_string label) symbols ignore in | |
let parse string label = | |
Grammar.parse (Text.of_string string) (Text.of_string label) | |
in | |
Grammar.empty | |
|> rule "Sum" [nonterm "Sum"; term '+'; nonterm "Product"] | |
|> rule "Sum" [nonterm "Sum"; term '-'; nonterm "Product"] | |
|> rule "Sum" [nonterm "Product"] | |
|> rule "Product" [nonterm "Product"; term '*'; nonterm "Factor"] | |
|> rule "Product" [nonterm "Product"; term '/'; nonterm "Factor"] | |
|> rule "Product" [nonterm "Factor"] | |
|> rule "Factor" [term '('; nonterm "Sum"; term ')'] | |
|> rule "Factor" [nonterm "Number"] | |
|> rule "Number" [term '0'; nonterm "Number"] | |
|> rule "Number" [term '1'; nonterm "Number"] | |
|> rule "Number" [term '2'; nonterm "Number"] | |
|> rule "Number" [term '3'; nonterm "Number"] | |
|> rule "Number" [term '4'; nonterm "Number"] | |
|> rule "Number" [term '5'; nonterm "Number"] | |
|> rule "Number" [term '6'; nonterm "Number"] | |
|> rule "Number" [term '7'; nonterm "Number"] | |
|> rule "Number" [term '8'; nonterm "Number"] | |
|> rule "Number" [term '9'; nonterm "Number"] | |
|> rule "Number" [term '0'] | |
|> rule "Number" [term '1'] | |
|> rule "Number" [term '2'] | |
|> rule "Number" [term '3'] | |
|> rule "Number" [term '4'] | |
|> rule "Number" [term '5'] | |
|> rule "Number" [term '6'] | |
|> rule "Number" [term '7'] | |
|> rule "Number" [term '8'] | |
|> rule "Number" [term '9'] | |
|> parse "1+(2*3-4)" "Sum" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment