Created
July 14, 2021 09:33
-
-
Save leetschau/2bbfaf6f2c5175a4526a004d06fadcbe to your computer and use it in GitHub Desktop.
Understanding Parser Combinators Series
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// This is the implementation of the first 2 parts of the wonderful 4-parts | |
/// posts "Understanding Parser Combinators" of | |
/// [F# for fun and profit](https://fsharpforfunandprofit.com/): | |
module ParserLib1 = | |
open System | |
/// Type that represents Success/Failure in parsing | |
type ParseResult<'a> = | |
| Success of 'a | |
| Failure of string | |
/// Type that wraps a parsing function | |
type Parser<'T> = Parser of (string -> ParseResult<'T * string>) | |
/// Parse a single character | |
let pchar charToMatch = | |
// define a nested inner function | |
let innerFn str = | |
if String.IsNullOrEmpty(str) then | |
Failure "No more input" | |
else | |
let first = str.[0] | |
if first = charToMatch then | |
let remaining = str.[1..] | |
Success (charToMatch,remaining) | |
else | |
let msg = sprintf "Expecting '%c'. Got '%c'" charToMatch first | |
Failure msg | |
// return the "wrapped" inner function | |
Parser innerFn | |
/// Run a parser with some input | |
let run parser input = | |
// unwrap parser to get inner function | |
let (Parser innerFn) = parser | |
// call inner function with input | |
innerFn input | |
/// Combine two parsers as "A andThen B" | |
let andThen parser1 parser2 = | |
let innerFn input = | |
// run parser1 with the input | |
let result1 = run parser1 input | |
// test the result for Failure/Success | |
match result1 with | |
| Failure err -> | |
// return error from parser1 | |
Failure err | |
| Success (value1, remaining1) -> | |
// run parser2 with the remaining input | |
let result2 = run parser2 remaining1 | |
// test the result for Failure/Success | |
match result2 with | |
| Failure err -> | |
// return error from parser2 | |
Failure err | |
| Success (value2, remaining2) -> | |
// combine both values as a pair | |
let newValue = (value1, value2) | |
// return remaining input after parser2 | |
Success (newValue, remaining2) | |
// return the inner function | |
Parser innerFn | |
/// Infix version of andThen | |
let ( .>>. ) = andThen | |
/// Combine two parsers as "A orElse B" | |
let orElse parser1 parser2 = | |
let innerFn input = | |
// run parser1 with the input | |
let result1 = run parser1 input | |
// test the result for Failure/Success | |
match result1 with | |
| Success result -> | |
// if success, return the original result | |
result1 | |
| Failure err -> | |
// if failed, run parser2 with the input | |
let result2 = run parser2 input | |
// return parser2's result | |
result2 | |
// return the inner function | |
Parser innerFn | |
/// Infix version of orElse | |
let ( <|> ) = orElse | |
/// Choose any of a list of parsers | |
let choice listOfParsers = | |
List.reduce ( <|> ) listOfParsers | |
/// Choose any of a list of characters | |
let anyOf listOfChars = | |
listOfChars | |
|> List.map pchar // convert into parsers | |
|> choice | |
open ParserLib1 | |
module ParserLib2 = | |
let mapP f parser = | |
let innerFn input = | |
// run parser with the input | |
let result = run parser input | |
// test the result for Failure/Success | |
match result with | |
| Success (value, remaining) -> | |
// if success, return the value transformed by f | |
let newValue = f value | |
Success (newValue, remaining) | |
| Failure err -> | |
// if failed, return the error | |
Failure err | |
// return the inner function | |
Parser innerFn | |
let ( <!> ) = mapP | |
let ( |>> ) x f = mapP f x | |
let parseDigit = anyOf ['0'..'9'] | |
// 注意此函数中 .>>. 与 |>> 的对应关系: | |
// 每次 .>>. 都会生成一个 (x, y),(见 L61) | |
let parseThreeDigitsAsStr = | |
(parseDigit .>>. parseDigit .>>. parseDigit) |>> | |
fun ((c1, c2), c3) -> System.String [| c1; c2; c3 |] | |
let returnP x = | |
let innerFn input = | |
// ignore the input and return x | |
Success (x, input) | |
// return the inner function | |
Parser innerFn | |
let applyP fP xP = | |
// create a Parser containing a pair (f, x) | |
// 这里的 fP 是一个 Parser<'a -> 'b>,就是说当这个 Parser 解析成功后生成一个函数, | |
// 比如普通解析器遇到 'A` 返回 Success("A") 之类的具体值, | |
// 这个解析器遇到 'A' 返回 Success(fun x -> x + 1, remaining-string) | |
// 之类的函数作为解析结果 | |
(fP .>>. xP) | |
// map the pair by applying f to x, | |
// 即将 fP 里包裹的函数取出来 apply 到 xP 里包裹的值上 | |
|> mapP (fun (f, x) -> f x) | |
let ( <*> ) = applyP | |
// returnP f 将普通函数 f 转换为解析器包裹的函数,然后再 applyP 到后面的数值上 | |
let lift2 f xP yP = returnP f <*> xP <*> yP | |
// 用这种方法可以将任何 OOP 式的方法转换为 FP 式的函数调用 | |
let startsWith (str: string) (prefix: string) = str.StartsWith(prefix) | |
let startsWithP = lift2 startsWith | |
let rec sequence parserList = | |
// define the "cons" function, which is a two parameter function | |
// 用这种方法可以将任何不方便用管道符的方法(比如 ::、 .[0])转换为 FP 式的函数调用 | |
let cons head tail = head::tail | |
// lift it to Parser World | |
let consP = lift2 cons | |
// process the list of parsers recursively | |
match parserList with | |
| [] -> | |
returnP [] | |
| head::tail -> | |
consP head (sequence tail) | |
let parsers = [ pchar 'A'; pchar 'B'; pchar 'C' ] | |
let combined = sequence parsers | |
let charListToStr charList = charList |> List.toArray |> System.String | |
// match a specific string | |
let pstring str = | |
str | |
// convert to list of char | |
|> List.ofSeq | |
// map each char to a pchar | |
|> List.map pchar | |
// convert to Parser<char list> | |
|> sequence | |
// convert Parser<char list> to Parser<string> | |
|> mapP charListToStr | |
let parseABC = pstring "ABC" | |
let rec parseZeroOrMore parser input = | |
// run parser with the input | |
let firstResult = run parser input | |
// test the result for Failure/Success | |
match firstResult with | |
| Failure err -> ([], input) | |
// if parse fails, return empty list | |
| Success (firstValue, inputAfterFirstParse) -> | |
// if parse succeeds, call recursively | |
// to get the subsequent values | |
let (subsequentValues, remainingInput) = | |
parseZeroOrMore parser inputAfterFirstParse | |
let values = firstValue :: subsequentValues | |
(values, remainingInput) | |
/// match zero or more occurrences of the specified parser | |
let many parser = | |
let innerFn input = | |
// parse the input -- wrap in Success as it always succeeds | |
Success (parseZeroOrMore parser input) | |
Parser innerFn | |
let manyA = many (pchar 'A') | |
/// match one or more occurrences of the specified parser | |
let many1 parser = | |
let innerFn input = | |
// run parser with the input | |
let firstResult = run parser input | |
// test the result for Failure/Success | |
match firstResult with | |
| Failure err -> | |
Failure err // failed | |
| Success (firstValue, inputAfterFirstParse) -> | |
// if first found, look for zeroOrMore now | |
let (subsequentValues, remainingInput) = | |
parseZeroOrMore parser inputAfterFirstParse | |
let values = firstValue :: subsequentValues | |
Success (values, remainingInput) | |
Parser innerFn | |
let pint = | |
let resultToInt digitList = | |
// ignore int overflow for now | |
digitList |> List.toArray |> System.String |> int | |
// define parser for one digit | |
let digit = anyOf ['0'..'9'] | |
// define parser for one or more digits | |
let digits = many1 digit | |
// map the digits to an int | |
digits |> mapP resultToInt | |
let opt p = | |
let some = p |>> Some | |
let none = returnP None | |
some <|> none | |
let pint2 = | |
let resultToInt (sign,charList) = | |
let i = charList |> List.toArray |> System.String |> int | |
match sign with | |
| Some ch -> -i // negate the int | |
| None -> i | |
// define parser for one digit | |
let digit = anyOf ['0'..'9'] | |
// define parser for one or more digits | |
let digits = many1 digit | |
// parse and convert | |
opt (pchar '-') .>>. digits |>> resultToInt | |
/// Keep only the result of the left side parser | |
let (.>>) p1 p2 = | |
// create a pair | |
p1 .>>. p2 | |
// then only keep the first value | |
|> mapP (fun (a,b) -> a) | |
/// Keep only the result of the right side parser | |
let (>>.) p1 p2 = | |
// create a pair | |
p1 .>>. p2 | |
// then only keep the second value | |
|> mapP (fun (a,b) -> b) | |
let between p1 p2 p3 = | |
p1 >>. p2 .>> p3 | |
let sepBy1 p sep = | |
let sepThenP = sep >>. p | |
p .>>. many sepThenP | |
|>> fun (p,pList) -> p::pList | |
/// Parses zero or more occurrences of p separated by sep | |
let sepBy p sep = | |
sepBy1 p sep <|> returnP [] | |
/// "bindP" takes a parser-producing function f, and a parser p | |
/// and passes the output of p into f, to create a new parser | |
let bindP f p = | |
let innerFn input = | |
let result1 = run p input | |
match result1 with | |
| Failure err -> | |
// return error from parser1 | |
Failure err | |
| Success (value1, remainingInput) -> | |
// apply f to get a new parser | |
let p2 = f value1 | |
// run parser with remaining input | |
run p2 remainingInput | |
Parser innerFn | |
let (>>=) p f = bindP f p | |
(* | |
To run in fsi, you need to run: | |
#load "myParser.fsx";; | |
open MyParser;; | |
open MyParser.ParserLib1;; | |
open MyParser.ParserLib2;; | |
*) | |
open ParserLib1 | |
open ParserLib2 | |
System.String [| '1'; '3'; '5' |] // output: System.String = "135" | |
run parseThreeDigitsAsStr "123A" | |
let addP = lift2 (+) | |
run combined "ABCD" | |
run parseABC "ABCDE" | |
run parseABC "A|CDE" | |
run parseABC "AB|CDE" | |
// test some success cases | |
run manyA "ABCD" // Success (['A'], "BCD") | |
run manyA "AACD" // Success (['A'; 'A'], "CD") | |
run manyA "AAAD" // Success (['A'; 'A'; 'A'], "D") | |
// test a case with no matches | |
run manyA "|BCD" // Success ([], "|BCD") | |
let manyAB = many (pstring "AB") | |
run manyAB "ABCD" // Success (["AB"], "CD") | |
run manyAB "ABABCD" // Success (["AB"; "AB"], "CD") | |
run manyAB "ZCD" // Success ([], "ZCD") | |
run manyAB "AZCD" // Success ([], "AZCD") | |
let whitespaceChar = anyOf [' '; '\t'; '\n'] | |
let whitespace = many whitespaceChar | |
run whitespace "ABC" // Success ([], "ABC") | |
run whitespace " ABC" // Success ([' '], "ABC") | |
run whitespace "\tABC" // Success (['\t'], "ABC") | |
// define parser for one digit | |
let digit = anyOf ['0'..'9'] | |
// define parser for one or more digits | |
let digits = many1 digit | |
run digits "1ABC" // Success (['1'], "ABC") | |
run digits "12BC" // Success (['1'; '2'], "BC") | |
run digits "123C" // Success (['1'; '2'; '3'], "C") | |
run digits "1234" // Success (['1'; '2'; '3'; '4'], "") | |
run digits "ABC" // Failure "Expecting '9'. Got 'A'" | |
run pint "1ABC" // Success (1, "ABC") | |
run pint "12BC" // Success (12, "BC") | |
run pint "123C" // Success (123, "C") | |
run pint "1234" // Success (1234, "") | |
run pint "ABC" // Failure "Expecting '9'. Got 'A'" | |
let digitThenSemicolon = digit .>>. opt (pchar ';') | |
run digitThenSemicolon "1;" // Success (('1', Some ';'), "") | |
run digitThenSemicolon "1" // Success (('1', None), "") | |
run pint2 "123C" | |
run pint2 "-123C" | |
// use .>> below | |
let digitThenSemicolon2 = digit .>> opt (pchar ';') | |
run digitThenSemicolon2 "1;" // Success ('1', "") | |
run digitThenSemicolon2 "1" // Success ('1', "") | |
let ab = pstring "AB" | |
let cd = pstring "CD" | |
let ab_cd = (ab .>> whitespace) .>>. cd | |
run ab_cd "AB \t\nCD" // Success (("AB", "CD"), "") | |
let pdoublequote = pchar '"' | |
let quotedInteger = between pdoublequote pint pdoublequote | |
run quotedInteger "\"1234\"" // Success (1234, "") | |
run quotedInteger "1234" // Failure "Expecting '"'. Got '1'" | |
let comma = pchar ',' | |
let zeroOrMoreDigitList = sepBy digit comma | |
let oneOrMoreDigitList = sepBy1 digit comma | |
run oneOrMoreDigitList "1;" // Success (['1'], ";") | |
run oneOrMoreDigitList "1,2;" // Success (['1'; '2'], ";") | |
run oneOrMoreDigitList "1,2,3;" // Success (['1'; '2'; '3'], ";") | |
run oneOrMoreDigitList "Z;" // Failure "Expecting '9'. Got 'Z'" | |
run zeroOrMoreDigitList "1;" // Success (['1'], ";") | |
run zeroOrMoreDigitList "1,2;" // Success (['1'; '2'], ";") | |
run zeroOrMoreDigitList "1,2,3;" // Success (['1'; '2'; '3'], ";") | |
run zeroOrMoreDigitList "Z;" // Success ([], "Z;") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment