Created
August 1, 2013 03:05
-
-
Save wweic/6128108 to your computer and use it in GitHub Desktop.
Parse regular expression pattern. using recursive descent.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* Parse a subset of regular expression, using recursive descent *) | |
(* modified from https://code.google.com/p/sharable-stuff/ *) | |
type regexp = | |
| Empty_String | |
| Char of char | |
| Union of regexp * regexp | |
| Concat of regexp * regexp | |
| Star of regexp | |
exception IllegalExpression | |
let string_to_regexp s = | |
let index = ref 0 in | |
let length = String.length s in | |
let lookahead _ = | |
if !index < length then | |
Some s.[!index] | |
else | |
None | |
in | |
let consume c = | |
if Some c = (lookahead ()) then | |
index := !index + 1 | |
else | |
raise IllegalExpression | |
and alphabet'contains = function | |
| Some c -> 'a' <= c & c <= 'z' | |
| None -> false | |
in | |
let starts'atom = function | |
| Some '(' -> true | |
| Some 'E' -> true | |
| co -> alphabet'contains co | |
in | |
let rec parseOR _ = | |
if starts'atom (lookahead ()) then | |
parseOR' (parseCONCAT ()) | |
else | |
raise IllegalExpression | |
and parseOR' re = | |
if Some '|' = (lookahead ()) then | |
let re' = parseOR' (parseCONCAT (consume '|')) in | |
Union (re, re') | |
else | |
re | |
and parseCONCAT _ = | |
if starts'atom (lookahead ()) then | |
parseCONCAT' (parseSTAR ()) | |
else | |
raise IllegalExpression | |
and parseCONCAT' re = | |
if starts'atom (lookahead ()) then | |
let re' = parseCONCAT' (parseSTAR ()) in | |
Concat (re, re') | |
else | |
re | |
and parseSTAR _ = | |
if starts'atom (lookahead ()) then | |
parseSTAR' (parseATOM ()) | |
else | |
raise IllegalExpression | |
and parseSTAR' re = | |
if Some '*' = (lookahead ()) then begin | |
consume '*'; | |
parseSTAR' (Star re) | |
end else | |
re | |
and parseATOM _ = | |
if Some '(' = (lookahead ()) then begin | |
let re = | |
consume '('; | |
parseOR () | |
in | |
consume ')'; | |
re | |
end else if Some 'E' = (lookahead ()) then begin | |
consume 'E'; | |
Empty_String | |
end else if alphabet'contains (lookahead ()) then | |
let c = match (lookahead ()) with | |
| Some c -> c | |
| None -> raise IllegalExpression | |
in | |
let re = Char c in | |
consume c; | |
re | |
else | |
raise IllegalExpression | |
in | |
parseOR ();; | |
(* test *) | |
string_to_regexp "ab";; | |
string_to_regexp "a+";; | |
string_to_regexp "E";; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment