Skip to content

Instantly share code, notes, and snippets.

@praeclarum
Last active November 26, 2020 22:29
Show Gist options
  • Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Parser combinator in F# tuned to perform "well enough" on iOS (Xamarin)
module Parsing
/// Remember where we are in the code.
/// This is a struct to keep memory pressure down.
/// (Significant perf improvements on iOS.)
type ParseState =
struct
val Code : string
val Index : int
new (code : string, index : int) =
{ Code = code; Index = index }
end
member p.Rem = p.Code.Length - p.Index
member p.WithIndex i = ParseState (p.Code, i)
/// Parsing results in either a value and the next parse state
/// or a failure (NoResult).
type ParseResult<'a> =
| Value of 'a * ParseState
| NoResult
/// A parser simply takes a parse state and produces a value
/// and the proceeding parse state.
type Parser<'a> = ParseState -> ParseResult<'a>
/// Stupid .NET number parsing functions can't work on substrings.
/// This is here just to keep the memory of lexing down.
let inline parseInt (s : string) (startIndex : int) (length : int) : int =
let mutable r = 0
let mutable i = 0
while i < length do
r <- (r * 10) + (int s.[startIndex + i] - int '0')
i <- i + 1
r
/// Lexing is slow mkay, and we don't want to slow it down
/// by constantly allocating strings. So we have this little
/// class to lazily get substrings from the source code.
type Substring =
struct
val Source : string
val Index : int
val Length : int
new (source : string, index : int, length : int) =
{ Source = source; Index = index; Length = length }
end
member this.Value = if this.Length = 0 then "" else this.Source.Substring (this.Index, this.Length)
member this.GetInt32Value () = parseInt this.Source this.Index this.Length
member this.GetDoubleValue () =
System.Double.Parse (this.Source.Substring (this.Index, this.Length),
System.Globalization.CultureInfo.InvariantCulture)
member x.Eq (y : string) =
let n = x.Length
if n <> y.Length then false
else
let mutable ok = true
let mutable i = 0
while ok && i < n do
ok <- x.Source.[x.Index + i] = y.[i]
i <- i + 1
ok
override this.ToString () = this.Value
let emptySubstring = Substring ("", 0, 0)
/// Match two parsers in sequence.
let ( &&& ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a * 'b> =
fun p ->
match a p with
| Value (av, ap) ->
match b ap with
| Value (bv, bp) -> Value ((av, bv), bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Match one or the other parsers.
let ( ||| ) (a : Parser<'a>) (b : Parser<'a>) : Parser<'a> =
fun p ->
match a p with
| NoResult ->
match b p with
| NoResult -> NoResult
| x -> x
| x -> x
/// Transform the result of a parser. Use this to construct
/// your AST.
let ( >>> ) (a : Parser<'a>) (m : 'a -> 'b) : Parser<'b> =
fun p ->
match a p with
| Value (av, ap) -> Value (m av, ap)
| NoResult -> NoResult
/// Match two parsers in sequence and return the second value.
let ( &&> ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'b> =
fun p ->
match a p with
| Value (_, ap) ->
match b ap with
| Value (bv, bp) -> Value (bv, bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Match two parsers in sequence and return the first value.
let ( &&< ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a> =
fun p ->
match a p with
| Value (av, ap) ->
match b ap with
| Value (_, bp) -> Value (av, bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Optional parsing - always succeeds
let opt (a : Parser<'a>) : Parser<'a option> =
fun p ->
match a p with
| Value (av, ap) -> Value (Some av, ap)
| NoResult -> Value (None, p)
/// Repeat a given parser 0 or more times
let zeroOrMore (a : Parser<'a>) : Parser<ResizeArray<'a>> =
fun p ->
let mutable cp = p
let mutable cont = true
let mutable r = ResizeArray<'a> ()
while cont do
match a cp with
| Value (av, ap) -> r.Add (av); cp <- ap
| NoResult -> cont <- false
Value (r, cp)
/// Repeat a given parser 0 or more times with a given separator
let zeroOrMoreSep (sep) (a : Parser<'a>) : Parser<ResizeArray<'a>> =
fun p ->
let mutable cp = p
let mutable cont = true
let mutable r = ResizeArray<'a> ()
let mutable needsSep = false
while cont do
let scp =
if needsSep then
match sep cp with
| Value (_, p) -> p
| NoResult -> cont <- false; cp
else cp
if cont then
match a scp with
| Value (av, ap) -> r.Add (av); cp <- ap; needsSep <- true
| NoResult -> cont <- false
Value (r, cp)
/// Skip white space in a ParseState
let ws (s : ParseState) : ParseState =
let n = s.Code.Length
if s.Index >= n then s
else
let mutable i = s.Index
let mutable ch = s.Code.[i]
while i < n && System.Char.IsWhiteSpace (ch) do
i <- i + 1
if i < n then ch <- s.Code.[i]
if i = s.Index then s
else s.WithIndex i
/// A fixed sequence of characters
let tok (s : string) : Parser<string> =
let n = s.Length
fun p ->
let p = ws p
if p.Rem < n then NoResult
else
let mutable ok = true
let mutable i = 0
while ok && i < n do
ok <- s.[i] = p.Code.[p.Index + i]
i <- i + 1
if ok then Value (s, p.WithIndex (p.Index + n))
else NoResult
let isIdentStart (ch : char) =
ch = '_' || System.Char.IsLetter (ch)
let isIdent (ch : char) =
ch = '_' || System.Char.IsLetterOrDigit (ch)
/// A C-style identifier
let ident (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 1 then NoResult
else
let mutable ok = isIdentStart p.Code.[p.Index]
if not ok then NoResult
else
let n = p.Code.Length
let start = p.Index
let mutable i = start
while ok && i < n do
ok <- isIdent p.Code.[i]
if ok then i <- i + 1
let r = Substring (p.Code, start, i - start)
Value (r, p.WithIndex i)
/// Parse a number with optional leading sign.
/// This is stuck on American style numbers.
let number (leadingSign : bool) (allowDecimal : bool) (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 1 then NoResult
else
let start = p.Index
let lead = p.Code.[p.Index]
let mutable ok, i =
match allowDecimal, leadingSign, lead with
| _, true, '+' -> true, start + 1
| _, true, '-' -> true, start + 1
| true, _, '.' -> true, start
| _, _, c when System.Char.IsDigit c -> true, start
| _ -> false, start
if not ok then NoResult
else
let n = p.Code.Length
let mutable hasDig = false
let mutable hasDecDot = false
let mutable hasExp = false
let mutable hasExpSign = false
while ok && i < n do
ok <-
match p.Code.[i] with
| '.' -> if not allowDecimal || hasDecDot then false
else hasDecDot <- true; true
| 'e'
| 'E' -> if not allowDecimal || not hasDig || hasExp then false
else hasExp <- true; true
| '-' -> if not allowDecimal || not hasExp || hasExpSign then false
else hasExpSign <- true; true
| '+' -> if not allowDecimal || not hasExp || hasExpSign then false
else hasExpSign <- true; true
| c when System.Char.IsDigit c -> hasDig <- true; true
| _ -> false
if ok then
i <- i + 1
if hasDig then
let r = Substring (p.Code, start, i - start)
Value (r, p.WithIndex i)
else NoResult
/// Parse a string delineated by the same start and stop char
/// While '\' is obeyed as in C, no post-processing is performed
/// to transform them.
let str (term : char) (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 2 then NoResult
else
let mutable ok = (p.Code.[p.Index] = term)
if not ok then NoResult
else
let n = p.Code.Length
let start = p.Index
let mutable i = start + 1
while ok && i < n do
if p.Code.[i] = term then ok <- false; i <- i + 1
else
if p.Code.[i] = '\\' && (i + 1) < n then
i <- i + 2
else
i <- i + 1
let r = Substring (p.Code, start + 1, i - start - 2)
Value (r, p.WithIndex i)
/// Parse a regex and return the whole thing. You won't believe how
/// slow these functions are. Profile their usage, it's amazing.
let re (pattern : string) : Parser<string> =
let r = new System.Text.RegularExpressions.Regex (pattern)
fun p ->
let m = r.Match (p.Code, p.Index)
if m.Success && m.Index = p.Index then Value (m.Value, p.WithIndex (p.Index + m.Length))
else NoResult
/// Parse a regex and return one of the groups.
/// This is great when prototyping a language, but omg,
/// don't use this in production. Regex is amazingly slow
/// and memory hungry.
let reg (pattern : string) (groupIndex : int) : Parser<string> =
let r = new System.Text.RegularExpressions.Regex (pattern)
fun p ->
let m = r.Match (p.Code, p.Index)
if m.Success && m.Index = p.Index then Value (m.Groups.[groupIndex].Value, p.WithIndex (p.Index + m.Length))
else NoResult
@praeclarum
Copy link
Author

praeclarum commented May 15, 2016

Here's an example of parsing SLN files:

let lp = tok "("
let rp = tok ")"
let eq = tok "="
let comma = tok ","
let dstr = str '\"' >>> (fun x -> x.ToString())
let sident = ident >>> (fun x -> x.ToString())
let zeroOrMoreA p = zeroOrMore p >>> (fun x -> x.ToArray ())

let parseHeader (s : ParseState) : ParseResult<string> =
    let hashIndex = s.Code.IndexOf ('#', s.Index)
    if hashIndex >= 0 then
        let newLineIndex = s.Code.IndexOf ('\n', hashIndex)
        if newLineIndex >= 0 then
            Value (s.Code.Substring (hashIndex, newLineIndex - hashIndex), s.WithIndex (newLineIndex))
        else NoResult
    else NoResult

let parseSectionValue (s : ParseState) : ParseResult<string*string> =
    let s = ws s
    let newLineIndex = s.Code.IndexOf ('\n', s.Index)
    let eqIndex = s.Code.IndexOf ('=', s.Index)
    if eqIndex >= 0 && eqIndex < newLineIndex then
        let k = s.Code.Substring(s.Index, eqIndex - s.Index).Trim()
        let v = s.Code.Substring(eqIndex + 1, newLineIndex - eqIndex - 1).Trim()
        Value ((k,v), s.WithIndex (newLineIndex+1))
    else
        NoResult

let parseSection =
    sident
    &&& (lp &&> sident &&< rp &&< eq)
    &&& sident
    &&& (zeroOrMoreA parseSectionValue)
    &&< sident
    >>> (fun (((t, b), v), vs) ->
        {
            SectionType = t
            SectionBase = b
            SectionValue = v
            Values = vs
        })

let parseProject =
    (tok "Project" &&> lp &&> dstr &&< rp &&< eq)
    &&& (dstr &&< comma)
    &&& (dstr &&< comma)
    &&& (dstr)
    &&& (zeroOrMoreA parseSection)
    &&< tok "EndProject"
    >>> (fun ((((lg, n), p), pg), s) ->
        {
            LanguageGuid = lg
            Name = n
            RelativeWinPath = p
            ProjectGuid = pg
            ProjectSections = s
        })

let parseSolution =
    parseHeader
    &&& (zeroOrMoreA parseProject)
    &&& (tok "Global" &&> zeroOrMoreA parseSection &&< tok "EndGlobal")
    >>> (fun ((_, ps), gs) ->
        {
            Projects = ps
            GlobalSections = gs
        })

let openString (slnString : string) : SolutionData =

    let startState = ParseState (slnString, 0)

    match parseSolution startState with
    | Value (x,_) -> x
    | _ -> failwith "Failed to parse solution file"

@gregoryyoung
Copy link

Quick comment on the regex, compiling them will help performance a lot.

Love the readability of the parser.

@vasily-kirichenko
Copy link

why not to use FParsec?

@praeclarum
Copy link
Author

@gregoryyoung Ah good point! I do all my work on iOS though and regex can't compile there.

@vasily-kirichenko FParsec doesn't work on iOS. Plus, isn't this cute? Pretty sure its faster too (FParsec does a lot more book keeping).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment