Last active
November 26, 2020 22:29
-
-
Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Parser combinator in F# tuned to perform "well enough" on iOS (Xamarin)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Parsing | |
/// Remember where we are in the code. | |
/// This is a struct to keep memory pressure down. | |
/// (Significant perf improvements on iOS.) | |
type ParseState = | |
struct | |
val Code : string | |
val Index : int | |
new (code : string, index : int) = | |
{ Code = code; Index = index } | |
end | |
member p.Rem = p.Code.Length - p.Index | |
member p.WithIndex i = ParseState (p.Code, i) | |
/// Parsing results in either a value and the next parse state | |
/// or a failure (NoResult). | |
type ParseResult<'a> = | |
| Value of 'a * ParseState | |
| NoResult | |
/// A parser simply takes a parse state and produces a value | |
/// and the proceeding parse state. | |
type Parser<'a> = ParseState -> ParseResult<'a> | |
/// Stupid .NET number parsing functions can't work on substrings. | |
/// This is here just to keep the memory of lexing down. | |
let inline parseInt (s : string) (startIndex : int) (length : int) : int = | |
let mutable r = 0 | |
let mutable i = 0 | |
while i < length do | |
r <- (r * 10) + (int s.[startIndex + i] - int '0') | |
i <- i + 1 | |
r | |
/// Lexing is slow mkay, and we don't want to slow it down | |
/// by constantly allocating strings. So we have this little | |
/// class to lazily get substrings from the source code. | |
type Substring = | |
struct | |
val Source : string | |
val Index : int | |
val Length : int | |
new (source : string, index : int, length : int) = | |
{ Source = source; Index = index; Length = length } | |
end | |
member this.Value = if this.Length = 0 then "" else this.Source.Substring (this.Index, this.Length) | |
member this.GetInt32Value () = parseInt this.Source this.Index this.Length | |
member this.GetDoubleValue () = | |
System.Double.Parse (this.Source.Substring (this.Index, this.Length), | |
System.Globalization.CultureInfo.InvariantCulture) | |
member x.Eq (y : string) = | |
let n = x.Length | |
if n <> y.Length then false | |
else | |
let mutable ok = true | |
let mutable i = 0 | |
while ok && i < n do | |
ok <- x.Source.[x.Index + i] = y.[i] | |
i <- i + 1 | |
ok | |
override this.ToString () = this.Value | |
let emptySubstring = Substring ("", 0, 0) | |
/// Match two parsers in sequence. | |
let ( &&& ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a * 'b> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> | |
match b ap with | |
| Value (bv, bp) -> Value ((av, bv), bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Match one or the other parsers. | |
let ( ||| ) (a : Parser<'a>) (b : Parser<'a>) : Parser<'a> = | |
fun p -> | |
match a p with | |
| NoResult -> | |
match b p with | |
| NoResult -> NoResult | |
| x -> x | |
| x -> x | |
/// Transform the result of a parser. Use this to construct | |
/// your AST. | |
let ( >>> ) (a : Parser<'a>) (m : 'a -> 'b) : Parser<'b> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> Value (m av, ap) | |
| NoResult -> NoResult | |
/// Match two parsers in sequence and return the second value. | |
let ( &&> ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'b> = | |
fun p -> | |
match a p with | |
| Value (_, ap) -> | |
match b ap with | |
| Value (bv, bp) -> Value (bv, bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Match two parsers in sequence and return the first value. | |
let ( &&< ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> | |
match b ap with | |
| Value (_, bp) -> Value (av, bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Optional parsing - always succeeds | |
let opt (a : Parser<'a>) : Parser<'a option> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> Value (Some av, ap) | |
| NoResult -> Value (None, p) | |
/// Repeat a given parser 0 or more times | |
let zeroOrMore (a : Parser<'a>) : Parser<ResizeArray<'a>> = | |
fun p -> | |
let mutable cp = p | |
let mutable cont = true | |
let mutable r = ResizeArray<'a> () | |
while cont do | |
match a cp with | |
| Value (av, ap) -> r.Add (av); cp <- ap | |
| NoResult -> cont <- false | |
Value (r, cp) | |
/// Repeat a given parser 0 or more times with a given separator | |
let zeroOrMoreSep (sep) (a : Parser<'a>) : Parser<ResizeArray<'a>> = | |
fun p -> | |
let mutable cp = p | |
let mutable cont = true | |
let mutable r = ResizeArray<'a> () | |
let mutable needsSep = false | |
while cont do | |
let scp = | |
if needsSep then | |
match sep cp with | |
| Value (_, p) -> p | |
| NoResult -> cont <- false; cp | |
else cp | |
if cont then | |
match a scp with | |
| Value (av, ap) -> r.Add (av); cp <- ap; needsSep <- true | |
| NoResult -> cont <- false | |
Value (r, cp) | |
/// Skip white space in a ParseState | |
let ws (s : ParseState) : ParseState = | |
let n = s.Code.Length | |
if s.Index >= n then s | |
else | |
let mutable i = s.Index | |
let mutable ch = s.Code.[i] | |
while i < n && System.Char.IsWhiteSpace (ch) do | |
i <- i + 1 | |
if i < n then ch <- s.Code.[i] | |
if i = s.Index then s | |
else s.WithIndex i | |
/// A fixed sequence of characters | |
let tok (s : string) : Parser<string> = | |
let n = s.Length | |
fun p -> | |
let p = ws p | |
if p.Rem < n then NoResult | |
else | |
let mutable ok = true | |
let mutable i = 0 | |
while ok && i < n do | |
ok <- s.[i] = p.Code.[p.Index + i] | |
i <- i + 1 | |
if ok then Value (s, p.WithIndex (p.Index + n)) | |
else NoResult | |
let isIdentStart (ch : char) = | |
ch = '_' || System.Char.IsLetter (ch) | |
let isIdent (ch : char) = | |
ch = '_' || System.Char.IsLetterOrDigit (ch) | |
/// A C-style identifier | |
let ident (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 1 then NoResult | |
else | |
let mutable ok = isIdentStart p.Code.[p.Index] | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let start = p.Index | |
let mutable i = start | |
while ok && i < n do | |
ok <- isIdent p.Code.[i] | |
if ok then i <- i + 1 | |
let r = Substring (p.Code, start, i - start) | |
Value (r, p.WithIndex i) | |
/// Parse a number with optional leading sign. | |
/// This is stuck on American style numbers. | |
let number (leadingSign : bool) (allowDecimal : bool) (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 1 then NoResult | |
else | |
let start = p.Index | |
let lead = p.Code.[p.Index] | |
let mutable ok, i = | |
match allowDecimal, leadingSign, lead with | |
| _, true, '+' -> true, start + 1 | |
| _, true, '-' -> true, start + 1 | |
| true, _, '.' -> true, start | |
| _, _, c when System.Char.IsDigit c -> true, start | |
| _ -> false, start | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let mutable hasDig = false | |
let mutable hasDecDot = false | |
let mutable hasExp = false | |
let mutable hasExpSign = false | |
while ok && i < n do | |
ok <- | |
match p.Code.[i] with | |
| '.' -> if not allowDecimal || hasDecDot then false | |
else hasDecDot <- true; true | |
| 'e' | |
| 'E' -> if not allowDecimal || not hasDig || hasExp then false | |
else hasExp <- true; true | |
| '-' -> if not allowDecimal || not hasExp || hasExpSign then false | |
else hasExpSign <- true; true | |
| '+' -> if not allowDecimal || not hasExp || hasExpSign then false | |
else hasExpSign <- true; true | |
| c when System.Char.IsDigit c -> hasDig <- true; true | |
| _ -> false | |
if ok then | |
i <- i + 1 | |
if hasDig then | |
let r = Substring (p.Code, start, i - start) | |
Value (r, p.WithIndex i) | |
else NoResult | |
/// Parse a string delineated by the same start and stop char | |
/// While '\' is obeyed as in C, no post-processing is performed | |
/// to transform them. | |
let str (term : char) (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 2 then NoResult | |
else | |
let mutable ok = (p.Code.[p.Index] = term) | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let start = p.Index | |
let mutable i = start + 1 | |
while ok && i < n do | |
if p.Code.[i] = term then ok <- false; i <- i + 1 | |
else | |
if p.Code.[i] = '\\' && (i + 1) < n then | |
i <- i + 2 | |
else | |
i <- i + 1 | |
let r = Substring (p.Code, start + 1, i - start - 2) | |
Value (r, p.WithIndex i) | |
/// Parse a regex and return the whole thing. You won't believe how | |
/// slow these functions are. Profile their usage, it's amazing. | |
let re (pattern : string) : Parser<string> = | |
let r = new System.Text.RegularExpressions.Regex (pattern) | |
fun p -> | |
let m = r.Match (p.Code, p.Index) | |
if m.Success && m.Index = p.Index then Value (m.Value, p.WithIndex (p.Index + m.Length)) | |
else NoResult | |
/// Parse a regex and return one of the groups. | |
/// This is great when prototyping a language, but omg, | |
/// don't use this in production. Regex is amazingly slow | |
/// and memory hungry. | |
let reg (pattern : string) (groupIndex : int) : Parser<string> = | |
let r = new System.Text.RegularExpressions.Regex (pattern) | |
fun p -> | |
let m = r.Match (p.Code, p.Index) | |
if m.Success && m.Index = p.Index then Value (m.Groups.[groupIndex].Value, p.WithIndex (p.Index + m.Length)) | |
else NoResult |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@gregoryyoung Ah good point! I do all my work on iOS though and regex can't compile there.
@vasily-kirichenko FParsec doesn't work on iOS. Plus, isn't this cute? Pretty sure its faster too (FParsec does a lot more book keeping).