Last active
November 26, 2020 22:29
-
-
Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Parser combinator in F# tuned to perform "well enough" on iOS (Xamarin)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Parsing | |
/// Remember where we are in the code. | |
/// This is a struct to keep memory pressure down. | |
/// (Significant perf improvements on iOS.) | |
type ParseState = | |
struct | |
val Code : string | |
val Index : int | |
new (code : string, index : int) = | |
{ Code = code; Index = index } | |
end | |
member p.Rem = p.Code.Length - p.Index | |
member p.WithIndex i = ParseState (p.Code, i) | |
/// Parsing results in either a value and the next parse state | |
/// or a failure (NoResult). | |
type ParseResult<'a> = | |
| Value of 'a * ParseState | |
| NoResult | |
/// A parser simply takes a parse state and produces a value | |
/// and the proceeding parse state. | |
type Parser<'a> = ParseState -> ParseResult<'a> | |
/// Stupid .NET number parsing functions can't work on substrings. | |
/// This is here just to keep the memory of lexing down. | |
let inline parseInt (s : string) (startIndex : int) (length : int) : int = | |
let mutable r = 0 | |
let mutable i = 0 | |
while i < length do | |
r <- (r * 10) + (int s.[startIndex + i] - int '0') | |
i <- i + 1 | |
r | |
/// Lexing is slow mkay, and we don't want to slow it down | |
/// by constantly allocating strings. So we have this little | |
/// class to lazily get substrings from the source code. | |
type Substring = | |
struct | |
val Source : string | |
val Index : int | |
val Length : int | |
new (source : string, index : int, length : int) = | |
{ Source = source; Index = index; Length = length } | |
end | |
member this.Value = if this.Length = 0 then "" else this.Source.Substring (this.Index, this.Length) | |
member this.GetInt32Value () = parseInt this.Source this.Index this.Length | |
member this.GetDoubleValue () = | |
System.Double.Parse (this.Source.Substring (this.Index, this.Length), | |
System.Globalization.CultureInfo.InvariantCulture) | |
member x.Eq (y : string) = | |
let n = x.Length | |
if n <> y.Length then false | |
else | |
let mutable ok = true | |
let mutable i = 0 | |
while ok && i < n do | |
ok <- x.Source.[x.Index + i] = y.[i] | |
i <- i + 1 | |
ok | |
override this.ToString () = this.Value | |
let emptySubstring = Substring ("", 0, 0) | |
/// Match two parsers in sequence. | |
let ( &&& ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a * 'b> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> | |
match b ap with | |
| Value (bv, bp) -> Value ((av, bv), bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Match one or the other parsers. | |
let ( ||| ) (a : Parser<'a>) (b : Parser<'a>) : Parser<'a> = | |
fun p -> | |
match a p with | |
| NoResult -> | |
match b p with | |
| NoResult -> NoResult | |
| x -> x | |
| x -> x | |
/// Transform the result of a parser. Use this to construct | |
/// your AST. | |
let ( >>> ) (a : Parser<'a>) (m : 'a -> 'b) : Parser<'b> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> Value (m av, ap) | |
| NoResult -> NoResult | |
/// Match two parsers in sequence and return the second value. | |
let ( &&> ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'b> = | |
fun p -> | |
match a p with | |
| Value (_, ap) -> | |
match b ap with | |
| Value (bv, bp) -> Value (bv, bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Match two parsers in sequence and return the first value. | |
let ( &&< ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> | |
match b ap with | |
| Value (_, bp) -> Value (av, bp) | |
| NoResult -> NoResult | |
| NoResult -> NoResult | |
/// Optional parsing - always succeeds | |
let opt (a : Parser<'a>) : Parser<'a option> = | |
fun p -> | |
match a p with | |
| Value (av, ap) -> Value (Some av, ap) | |
| NoResult -> Value (None, p) | |
/// Repeat a given parser 0 or more times | |
let zeroOrMore (a : Parser<'a>) : Parser<ResizeArray<'a>> = | |
fun p -> | |
let mutable cp = p | |
let mutable cont = true | |
let mutable r = ResizeArray<'a> () | |
while cont do | |
match a cp with | |
| Value (av, ap) -> r.Add (av); cp <- ap | |
| NoResult -> cont <- false | |
Value (r, cp) | |
/// Repeat a given parser 0 or more times with a given separator | |
let zeroOrMoreSep (sep) (a : Parser<'a>) : Parser<ResizeArray<'a>> = | |
fun p -> | |
let mutable cp = p | |
let mutable cont = true | |
let mutable r = ResizeArray<'a> () | |
let mutable needsSep = false | |
while cont do | |
let scp = | |
if needsSep then | |
match sep cp with | |
| Value (_, p) -> p | |
| NoResult -> cont <- false; cp | |
else cp | |
if cont then | |
match a scp with | |
| Value (av, ap) -> r.Add (av); cp <- ap; needsSep <- true | |
| NoResult -> cont <- false | |
Value (r, cp) | |
/// Skip white space in a ParseState | |
let ws (s : ParseState) : ParseState = | |
let n = s.Code.Length | |
if s.Index >= n then s | |
else | |
let mutable i = s.Index | |
let mutable ch = s.Code.[i] | |
while i < n && System.Char.IsWhiteSpace (ch) do | |
i <- i + 1 | |
if i < n then ch <- s.Code.[i] | |
if i = s.Index then s | |
else s.WithIndex i | |
/// A fixed sequence of characters | |
let tok (s : string) : Parser<string> = | |
let n = s.Length | |
fun p -> | |
let p = ws p | |
if p.Rem < n then NoResult | |
else | |
let mutable ok = true | |
let mutable i = 0 | |
while ok && i < n do | |
ok <- s.[i] = p.Code.[p.Index + i] | |
i <- i + 1 | |
if ok then Value (s, p.WithIndex (p.Index + n)) | |
else NoResult | |
let isIdentStart (ch : char) = | |
ch = '_' || System.Char.IsLetter (ch) | |
let isIdent (ch : char) = | |
ch = '_' || System.Char.IsLetterOrDigit (ch) | |
/// A C-style identifier | |
let ident (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 1 then NoResult | |
else | |
let mutable ok = isIdentStart p.Code.[p.Index] | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let start = p.Index | |
let mutable i = start | |
while ok && i < n do | |
ok <- isIdent p.Code.[i] | |
if ok then i <- i + 1 | |
let r = Substring (p.Code, start, i - start) | |
Value (r, p.WithIndex i) | |
/// Parse a number with optional leading sign. | |
/// This is stuck on American style numbers. | |
let number (leadingSign : bool) (allowDecimal : bool) (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 1 then NoResult | |
else | |
let start = p.Index | |
let lead = p.Code.[p.Index] | |
let mutable ok, i = | |
match allowDecimal, leadingSign, lead with | |
| _, true, '+' -> true, start + 1 | |
| _, true, '-' -> true, start + 1 | |
| true, _, '.' -> true, start | |
| _, _, c when System.Char.IsDigit c -> true, start | |
| _ -> false, start | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let mutable hasDig = false | |
let mutable hasDecDot = false | |
let mutable hasExp = false | |
let mutable hasExpSign = false | |
while ok && i < n do | |
ok <- | |
match p.Code.[i] with | |
| '.' -> if not allowDecimal || hasDecDot then false | |
else hasDecDot <- true; true | |
| 'e' | |
| 'E' -> if not allowDecimal || not hasDig || hasExp then false | |
else hasExp <- true; true | |
| '-' -> if not allowDecimal || not hasExp || hasExpSign then false | |
else hasExpSign <- true; true | |
| '+' -> if not allowDecimal || not hasExp || hasExpSign then false | |
else hasExpSign <- true; true | |
| c when System.Char.IsDigit c -> hasDig <- true; true | |
| _ -> false | |
if ok then | |
i <- i + 1 | |
if hasDig then | |
let r = Substring (p.Code, start, i - start) | |
Value (r, p.WithIndex i) | |
else NoResult | |
/// Parse a string delineated by the same start and stop char | |
/// While '\' is obeyed as in C, no post-processing is performed | |
/// to transform them. | |
let str (term : char) (p : ParseState) : ParseResult<Substring> = | |
let p = ws p | |
if p.Rem < 2 then NoResult | |
else | |
let mutable ok = (p.Code.[p.Index] = term) | |
if not ok then NoResult | |
else | |
let n = p.Code.Length | |
let start = p.Index | |
let mutable i = start + 1 | |
while ok && i < n do | |
if p.Code.[i] = term then ok <- false; i <- i + 1 | |
else | |
if p.Code.[i] = '\\' && (i + 1) < n then | |
i <- i + 2 | |
else | |
i <- i + 1 | |
let r = Substring (p.Code, start + 1, i - start - 2) | |
Value (r, p.WithIndex i) | |
/// Parse a regex and return the whole thing. You won't believe how | |
/// slow these functions are. Profile their usage, it's amazing. | |
let re (pattern : string) : Parser<string> = | |
let r = new System.Text.RegularExpressions.Regex (pattern) | |
fun p -> | |
let m = r.Match (p.Code, p.Index) | |
if m.Success && m.Index = p.Index then Value (m.Value, p.WithIndex (p.Index + m.Length)) | |
else NoResult | |
/// Parse a regex and return one of the groups. | |
/// This is great when prototyping a language, but omg, | |
/// don't use this in production. Regex is amazingly slow | |
/// and memory hungry. | |
let reg (pattern : string) (groupIndex : int) : Parser<string> = | |
let r = new System.Text.RegularExpressions.Regex (pattern) | |
fun p -> | |
let m = r.Match (p.Code, p.Index) | |
if m.Success && m.Index = p.Index then Value (m.Groups.[groupIndex].Value, p.WithIndex (p.Index + m.Length)) | |
else NoResult |
Here it is parsing a little functional language:
let pVar = ident >>> EVariable
let pInt = (ws >> re "\\d+") >>> System.Int64.Parse >>> LInt >>> ELiteral
let pString = (ws >> re "\"[^\"]*\"") >>> LString >>> ELiteral
let pFloat = (ws >> re "(\\d+)((\\.\\d+)|([eE](\\+|\\-|)\\d+)|(\\.\\d+)([eE](\\+|\\-|)\\d+))") >>> System.Double.Parse >>> LFloat >>> ELiteral
let pBool = (ws >> re "true|false") >>> (fun x -> x = "true") >>> LBool >>> ELiteral
let rec pExpr (s : ParseState) =
let p =
pLet
||| pAbstraction
||| pFloat
||| pInt
||| pBool
||| pVar
||| pString
p s
and pLet (s : ParseState) =
let p =
(kw "let" &&& ident >>> snd)
&&& (op "=" &&& pExpr >>> snd)
&&& (kw "in" &&& pExpr >>> snd)
>>> (fun ((n, v), b) -> ELet (n, v, b))
p s
and pAbstraction (s : ParseState) =
let p =
(op "{" &&& ident >>> snd)
&&& (kw "in" &&& pExpr &&& op "}" >>> fst >>> snd)
>>> (fun (n, b) -> EAbstraction (n, b))
p s
let parseExp (code : string) =
let s = { Code = code; Index = 0; }
match pExpr s with
| Some (e, _) -> e
| None -> failwith "Cannot parse"
Here's an example of parsing SLN files:
let lp = tok "("
let rp = tok ")"
let eq = tok "="
let comma = tok ","
let dstr = str '\"' >>> (fun x -> x.ToString())
let sident = ident >>> (fun x -> x.ToString())
let zeroOrMoreA p = zeroOrMore p >>> (fun x -> x.ToArray ())
let parseHeader (s : ParseState) : ParseResult<string> =
let hashIndex = s.Code.IndexOf ('#', s.Index)
if hashIndex >= 0 then
let newLineIndex = s.Code.IndexOf ('\n', hashIndex)
if newLineIndex >= 0 then
Value (s.Code.Substring (hashIndex, newLineIndex - hashIndex), s.WithIndex (newLineIndex))
else NoResult
else NoResult
let parseSectionValue (s : ParseState) : ParseResult<string*string> =
let s = ws s
let newLineIndex = s.Code.IndexOf ('\n', s.Index)
let eqIndex = s.Code.IndexOf ('=', s.Index)
if eqIndex >= 0 && eqIndex < newLineIndex then
let k = s.Code.Substring(s.Index, eqIndex - s.Index).Trim()
let v = s.Code.Substring(eqIndex + 1, newLineIndex - eqIndex - 1).Trim()
Value ((k,v), s.WithIndex (newLineIndex+1))
else
NoResult
let parseSection =
sident
&&& (lp &&> sident &&< rp &&< eq)
&&& sident
&&& (zeroOrMoreA parseSectionValue)
&&< sident
>>> (fun (((t, b), v), vs) ->
{
SectionType = t
SectionBase = b
SectionValue = v
Values = vs
})
let parseProject =
(tok "Project" &&> lp &&> dstr &&< rp &&< eq)
&&& (dstr &&< comma)
&&& (dstr &&< comma)
&&& (dstr)
&&& (zeroOrMoreA parseSection)
&&< tok "EndProject"
>>> (fun ((((lg, n), p), pg), s) ->
{
LanguageGuid = lg
Name = n
RelativeWinPath = p
ProjectGuid = pg
ProjectSections = s
})
let parseSolution =
parseHeader
&&& (zeroOrMoreA parseProject)
&&& (tok "Global" &&> zeroOrMoreA parseSection &&< tok "EndGlobal")
>>> (fun ((_, ps), gs) ->
{
Projects = ps
GlobalSections = gs
})
let openString (slnString : string) : SolutionData =
let startState = ParseState (slnString, 0)
match parseSolution startState with
| Value (x,_) -> x
| _ -> failwith "Failed to parse solution file"
Quick comment on the regex, compiling them will help performance a lot.
Love the readability of the parser.
why not to use FParsec?
@gregoryyoung Ah good point! I do all my work on iOS though and regex can't compile there.
@vasily-kirichenko FParsec doesn't work on iOS. Plus, isn't this cute? Pretty sure its faster too (FParsec does a lot more book keeping).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Cool stuff! I love parsing stuff in F#.