Skip to content

Instantly share code, notes, and snippets.

@praeclarum
Last active November 26, 2020 22:29
Show Gist options
  • Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Save praeclarum/31f37946b49a58930967 to your computer and use it in GitHub Desktop.
Parser combinator in F# tuned to perform "well enough" on iOS (Xamarin)
module Parsing
/// Remember where we are in the code.
/// This is a struct to keep memory pressure down.
/// (Significant perf improvements on iOS.)
type ParseState =
struct
val Code : string
val Index : int
new (code : string, index : int) =
{ Code = code; Index = index }
end
member p.Rem = p.Code.Length - p.Index
member p.WithIndex i = ParseState (p.Code, i)
/// Parsing results in either a value and the next parse state
/// or a failure (NoResult).
type ParseResult<'a> =
| Value of 'a * ParseState
| NoResult
/// A parser simply takes a parse state and produces a value
/// and the proceeding parse state.
type Parser<'a> = ParseState -> ParseResult<'a>
/// Stupid .NET number parsing functions can't work on substrings.
/// This is here just to keep the memory of lexing down.
let inline parseInt (s : string) (startIndex : int) (length : int) : int =
let mutable r = 0
let mutable i = 0
while i < length do
r <- (r * 10) + (int s.[startIndex + i] - int '0')
i <- i + 1
r
/// Lexing is slow mkay, and we don't want to slow it down
/// by constantly allocating strings. So we have this little
/// class to lazily get substrings from the source code.
type Substring =
struct
val Source : string
val Index : int
val Length : int
new (source : string, index : int, length : int) =
{ Source = source; Index = index; Length = length }
end
member this.Value = if this.Length = 0 then "" else this.Source.Substring (this.Index, this.Length)
member this.GetInt32Value () = parseInt this.Source this.Index this.Length
member this.GetDoubleValue () =
System.Double.Parse (this.Source.Substring (this.Index, this.Length),
System.Globalization.CultureInfo.InvariantCulture)
member x.Eq (y : string) =
let n = x.Length
if n <> y.Length then false
else
let mutable ok = true
let mutable i = 0
while ok && i < n do
ok <- x.Source.[x.Index + i] = y.[i]
i <- i + 1
ok
override this.ToString () = this.Value
let emptySubstring = Substring ("", 0, 0)
/// Match two parsers in sequence.
let ( &&& ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a * 'b> =
fun p ->
match a p with
| Value (av, ap) ->
match b ap with
| Value (bv, bp) -> Value ((av, bv), bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Match one or the other parsers.
let ( ||| ) (a : Parser<'a>) (b : Parser<'a>) : Parser<'a> =
fun p ->
match a p with
| NoResult ->
match b p with
| NoResult -> NoResult
| x -> x
| x -> x
/// Transform the result of a parser. Use this to construct
/// your AST.
let ( >>> ) (a : Parser<'a>) (m : 'a -> 'b) : Parser<'b> =
fun p ->
match a p with
| Value (av, ap) -> Value (m av, ap)
| NoResult -> NoResult
/// Match two parsers in sequence and return the second value.
let ( &&> ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'b> =
fun p ->
match a p with
| Value (_, ap) ->
match b ap with
| Value (bv, bp) -> Value (bv, bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Match two parsers in sequence and return the first value.
let ( &&< ) (a : Parser<'a>) (b : Parser<'b>) : Parser<'a> =
fun p ->
match a p with
| Value (av, ap) ->
match b ap with
| Value (_, bp) -> Value (av, bp)
| NoResult -> NoResult
| NoResult -> NoResult
/// Optional parsing - always succeeds
let opt (a : Parser<'a>) : Parser<'a option> =
fun p ->
match a p with
| Value (av, ap) -> Value (Some av, ap)
| NoResult -> Value (None, p)
/// Repeat a given parser 0 or more times
let zeroOrMore (a : Parser<'a>) : Parser<ResizeArray<'a>> =
fun p ->
let mutable cp = p
let mutable cont = true
let mutable r = ResizeArray<'a> ()
while cont do
match a cp with
| Value (av, ap) -> r.Add (av); cp <- ap
| NoResult -> cont <- false
Value (r, cp)
/// Repeat a given parser 0 or more times with a given separator
let zeroOrMoreSep (sep) (a : Parser<'a>) : Parser<ResizeArray<'a>> =
fun p ->
let mutable cp = p
let mutable cont = true
let mutable r = ResizeArray<'a> ()
let mutable needsSep = false
while cont do
let scp =
if needsSep then
match sep cp with
| Value (_, p) -> p
| NoResult -> cont <- false; cp
else cp
if cont then
match a scp with
| Value (av, ap) -> r.Add (av); cp <- ap; needsSep <- true
| NoResult -> cont <- false
Value (r, cp)
/// Skip white space in a ParseState
let ws (s : ParseState) : ParseState =
let n = s.Code.Length
if s.Index >= n then s
else
let mutable i = s.Index
let mutable ch = s.Code.[i]
while i < n && System.Char.IsWhiteSpace (ch) do
i <- i + 1
if i < n then ch <- s.Code.[i]
if i = s.Index then s
else s.WithIndex i
/// A fixed sequence of characters
let tok (s : string) : Parser<string> =
let n = s.Length
fun p ->
let p = ws p
if p.Rem < n then NoResult
else
let mutable ok = true
let mutable i = 0
while ok && i < n do
ok <- s.[i] = p.Code.[p.Index + i]
i <- i + 1
if ok then Value (s, p.WithIndex (p.Index + n))
else NoResult
let isIdentStart (ch : char) =
ch = '_' || System.Char.IsLetter (ch)
let isIdent (ch : char) =
ch = '_' || System.Char.IsLetterOrDigit (ch)
/// A C-style identifier
let ident (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 1 then NoResult
else
let mutable ok = isIdentStart p.Code.[p.Index]
if not ok then NoResult
else
let n = p.Code.Length
let start = p.Index
let mutable i = start
while ok && i < n do
ok <- isIdent p.Code.[i]
if ok then i <- i + 1
let r = Substring (p.Code, start, i - start)
Value (r, p.WithIndex i)
/// Parse a number with optional leading sign.
/// This is stuck on American style numbers.
let number (leadingSign : bool) (allowDecimal : bool) (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 1 then NoResult
else
let start = p.Index
let lead = p.Code.[p.Index]
let mutable ok, i =
match allowDecimal, leadingSign, lead with
| _, true, '+' -> true, start + 1
| _, true, '-' -> true, start + 1
| true, _, '.' -> true, start
| _, _, c when System.Char.IsDigit c -> true, start
| _ -> false, start
if not ok then NoResult
else
let n = p.Code.Length
let mutable hasDig = false
let mutable hasDecDot = false
let mutable hasExp = false
let mutable hasExpSign = false
while ok && i < n do
ok <-
match p.Code.[i] with
| '.' -> if not allowDecimal || hasDecDot then false
else hasDecDot <- true; true
| 'e'
| 'E' -> if not allowDecimal || not hasDig || hasExp then false
else hasExp <- true; true
| '-' -> if not allowDecimal || not hasExp || hasExpSign then false
else hasExpSign <- true; true
| '+' -> if not allowDecimal || not hasExp || hasExpSign then false
else hasExpSign <- true; true
| c when System.Char.IsDigit c -> hasDig <- true; true
| _ -> false
if ok then
i <- i + 1
if hasDig then
let r = Substring (p.Code, start, i - start)
Value (r, p.WithIndex i)
else NoResult
/// Parse a string delineated by the same start and stop char
/// While '\' is obeyed as in C, no post-processing is performed
/// to transform them.
let str (term : char) (p : ParseState) : ParseResult<Substring> =
let p = ws p
if p.Rem < 2 then NoResult
else
let mutable ok = (p.Code.[p.Index] = term)
if not ok then NoResult
else
let n = p.Code.Length
let start = p.Index
let mutable i = start + 1
while ok && i < n do
if p.Code.[i] = term then ok <- false; i <- i + 1
else
if p.Code.[i] = '\\' && (i + 1) < n then
i <- i + 2
else
i <- i + 1
let r = Substring (p.Code, start + 1, i - start - 2)
Value (r, p.WithIndex i)
/// Parse a regex and return the whole thing. You won't believe how
/// slow these functions are. Profile their usage, it's amazing.
let re (pattern : string) : Parser<string> =
let r = new System.Text.RegularExpressions.Regex (pattern)
fun p ->
let m = r.Match (p.Code, p.Index)
if m.Success && m.Index = p.Index then Value (m.Value, p.WithIndex (p.Index + m.Length))
else NoResult
/// Parse a regex and return one of the groups.
/// This is great when prototyping a language, but omg,
/// don't use this in production. Regex is amazingly slow
/// and memory hungry.
let reg (pattern : string) (groupIndex : int) : Parser<string> =
let r = new System.Text.RegularExpressions.Regex (pattern)
fun p ->
let m = r.Match (p.Code, p.Index)
if m.Success && m.Index = p.Index then Value (m.Groups.[groupIndex].Value, p.WithIndex (p.Index + m.Length))
else NoResult
@praeclarum
Copy link
Author

Here's an example of it parsing STEP files to get 3D geometry. On iOS, this is able to parse a 1.8 MB file in 2 seconds.

    let semi = tok ";"
    let comma = tok ","
    let lparen = tok "("
    let rparen = tok ")"
    let eqop = tok "="
    let magic = tok "ISO-10303-21"
    let header = tok "HEADER"
    let endsec = tok "ENDSEC"
    let data = tok "DATA"

    let noneStringVal = StringExpr emptySubstring
    let trueVal = BoolExpr true
    let falseVal = BoolExpr false

    let noneStringExpr = tok "'NONE'" >>> (fun x -> noneStringVal)
    let trueExpr = tok ".T." >>> (fun x -> trueVal)
    let falseExpr = tok ".F." >>> (fun x -> falseVal)
    let dollarExpr = tok "$" >>> (fun x -> DollarExpr)
    let starExpr = tok "*" >>> (fun x -> StarExpr)
    let numberExpr = number true true >>> (fun x -> NumberExpr (try x.GetDoubleValue ()
                                                                with ex -> failwithf "`%s` is not a number" x.Value))
    let stringExpr = str '\'' >>> (fun x -> StringExpr x)
    let symExpr = str '.' >>> (fun x -> SymbolExpr x)

    let refExpr = tok "#" &&& number false false >>> (fun (_,x) -> RefExpr (x.GetInt32Value ()))

    let rec expr s =
        (trueExpr ||| falseExpr
         ||| starExpr ||| dollarExpr
         ||| noneStringExpr
         ||| numberExpr
         ||| funcall ||| vector
         ||| stringExpr ||| symExpr ||| refExpr          
         ) s

    and vector s =
        (lparen &&& args &&& rparen >>> fst >>> snd) s

    and funcall s =
        (ident &&& (lparen &&& args &&& rparen >>> fst >>> snd)
         >>> (fun (x : Substring, e) -> FuncallExpr (x, e))) s

    and args s =
        (zeroOrMoreSep comma expr >>> (fun x -> TupleExpr x)) s

    let headerSec =
        let prefix = magic &&& semi &&& header &&& semi
        let suffix = endsec &&& semi
        prefix &&& zeroOrMore (funcall &&& semi >>> fst) &&& suffix >>> fst >>> snd

    let unitThing = 
        lparen &&& zeroOrMore funcall &&& rparen >>> fst >>> snd >>> (fun x -> TupleExpr x)

    let entity =
        (refExpr &&& (eqop &&& (expr ||| unitThing) &&& semi >>> fst >>> snd))

    let dataSec =
        let prefix = data &&& semi
        let suffix = endsec &&& semi
        prefix &&& zeroOrMore entity &&& suffix >>> fst >>> snd

    let parse (url : string) (code : string) =
        let s = ParseState (code, 0)
        let p = (headerSec &&& dataSec) s
        match p with
        | Value ((h, es), _) ->
            let ents =
                es
                |> Seq.choose getEntity
                |> Map.ofSeq
            { Url = url; Entities = ents }
        | _ -> { Url = url; Entities = Map.empty }

@TIHan
Copy link

TIHan commented Sep 26, 2015

Cool stuff! I love parsing stuff in F#.

@praeclarum
Copy link
Author

Here it is parsing a little functional language:

    let pVar = ident >>> EVariable

    let pInt = (ws >> re "\\d+") >>> System.Int64.Parse >>> LInt >>> ELiteral
    let pString = (ws >> re "\"[^\"]*\"") >>> LString >>> ELiteral
    let pFloat = (ws >> re "(\\d+)((\\.\\d+)|([eE](\\+|\\-|)\\d+)|(\\.\\d+)([eE](\\+|\\-|)\\d+))") >>> System.Double.Parse >>> LFloat >>> ELiteral
    let pBool = (ws >> re "true|false") >>> (fun x -> x = "true") >>> LBool >>> ELiteral

    let rec pExpr (s : ParseState) =
        let p =
            pLet
            ||| pAbstraction
            ||| pFloat
            ||| pInt
            ||| pBool
            ||| pVar
            ||| pString
        p s

    and pLet (s : ParseState) =
        let p = 
            (kw "let" &&& ident >>> snd)
            &&& (op "=" &&& pExpr >>> snd)
            &&& (kw "in" &&& pExpr >>> snd)
            >>> (fun ((n, v), b) -> ELet (n, v, b))
        p s

    and pAbstraction (s : ParseState) =
        let p = 
            (op "{" &&& ident >>> snd)
            &&& (kw "in" &&& pExpr &&& op "}" >>> fst >>> snd)
            >>> (fun (n, b) -> EAbstraction (n, b))
        p s


    let parseExp (code : string) =
        let s = { Code = code; Index = 0; }
        match pExpr s with
        | Some (e, _) -> e
        | None -> failwith "Cannot parse"

@praeclarum
Copy link
Author

praeclarum commented May 15, 2016

Here's an example of parsing SLN files:

let lp = tok "("
let rp = tok ")"
let eq = tok "="
let comma = tok ","
let dstr = str '\"' >>> (fun x -> x.ToString())
let sident = ident >>> (fun x -> x.ToString())
let zeroOrMoreA p = zeroOrMore p >>> (fun x -> x.ToArray ())

let parseHeader (s : ParseState) : ParseResult<string> =
    let hashIndex = s.Code.IndexOf ('#', s.Index)
    if hashIndex >= 0 then
        let newLineIndex = s.Code.IndexOf ('\n', hashIndex)
        if newLineIndex >= 0 then
            Value (s.Code.Substring (hashIndex, newLineIndex - hashIndex), s.WithIndex (newLineIndex))
        else NoResult
    else NoResult

let parseSectionValue (s : ParseState) : ParseResult<string*string> =
    let s = ws s
    let newLineIndex = s.Code.IndexOf ('\n', s.Index)
    let eqIndex = s.Code.IndexOf ('=', s.Index)
    if eqIndex >= 0 && eqIndex < newLineIndex then
        let k = s.Code.Substring(s.Index, eqIndex - s.Index).Trim()
        let v = s.Code.Substring(eqIndex + 1, newLineIndex - eqIndex - 1).Trim()
        Value ((k,v), s.WithIndex (newLineIndex+1))
    else
        NoResult

let parseSection =
    sident
    &&& (lp &&> sident &&< rp &&< eq)
    &&& sident
    &&& (zeroOrMoreA parseSectionValue)
    &&< sident
    >>> (fun (((t, b), v), vs) ->
        {
            SectionType = t
            SectionBase = b
            SectionValue = v
            Values = vs
        })

let parseProject =
    (tok "Project" &&> lp &&> dstr &&< rp &&< eq)
    &&& (dstr &&< comma)
    &&& (dstr &&< comma)
    &&& (dstr)
    &&& (zeroOrMoreA parseSection)
    &&< tok "EndProject"
    >>> (fun ((((lg, n), p), pg), s) ->
        {
            LanguageGuid = lg
            Name = n
            RelativeWinPath = p
            ProjectGuid = pg
            ProjectSections = s
        })

let parseSolution =
    parseHeader
    &&& (zeroOrMoreA parseProject)
    &&& (tok "Global" &&> zeroOrMoreA parseSection &&< tok "EndGlobal")
    >>> (fun ((_, ps), gs) ->
        {
            Projects = ps
            GlobalSections = gs
        })

let openString (slnString : string) : SolutionData =

    let startState = ParseState (slnString, 0)

    match parseSolution startState with
    | Value (x,_) -> x
    | _ -> failwith "Failed to parse solution file"

@gregoryyoung
Copy link

Quick comment on the regex, compiling them will help performance a lot.

Love the readability of the parser.

@vasily-kirichenko
Copy link

why not to use FParsec?

@praeclarum
Copy link
Author

@gregoryyoung Ah good point! I do all my work on iOS though and regex can't compile there.

@vasily-kirichenko FParsec doesn't work on iOS. Plus, isn't this cute? Pretty sure its faster too (FParsec does a lot more book keeping).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment