javascript-based parser
<script id="library" title="it must be a part of starndard javascript"> | |
const log = console.log ; const assert = console.assert | |
update = (o, mutator) => {mutator(o); return o} | |
trace = (value, format) => {format = format || (data => data) ; log(format(value)); return value}; | |
tracejs = (msg, data) => trace(data, data => [msg, data.json()]) | |
Object.prototype.trace = function(format) {trace(this, format)} | |
function range (len, begin) {begin = begin || 0 | |
return Array.apply(null, Array(len)).map((_, i) => {return begin+i;})} | |
Object.prototype.entries = function() {return Object.keys(this).map(k => [k, this[k]])}; | |
//Array.prototype.fill = function(filler) {return this.map(_ => filler)} // standard func
Array.prototype.equals = function(that) {return this.every((my, i) => my == that[i] )} ; [1,2].equals([1,2]) | |
Array.prototype.sum = function() {return this.reduce((acc, el) => acc + el,0)} | |
Array.prototype.product = function() {return this.reduce((acc, el) => acc * el,1)} | |
Object.prototype.json = function() {return JSON.stringify(this)} | |
Array.prototype.last = function() {return this[this.length-1]} | |
Array.prototype.mapField = function(key, f){ return this.map( o => Object.assign({}, o, {[key]: f(o[key])}) ) }
Array.prototype.sliceIndex = function(i) {return this.map(e => e[i])} | |
Array.prototype.mapIndex = function(i, f){ return this.map( child =>Object.assign([], child, {[i]:f(child[i])} ) )} | |
Array.prototype.equals = function(that) {return this.length == that.length && this.every((v,i)=> v === that[i])} | |
Array.prototype.zip = function(...those) {return this.map((my, i) => those.reduce((acc, that) => [...acc, that[i]],[my])) } //; [11,22,33].zip([1,2,3]).json() | |
String.prototype.toArray = function() {return range(this.length).map(i => this.charAt(i))} | |
Array.prototype.unzip = function() {return this[0].map((_, i) => this.map(sub => sub[i]))} | |
Map.prototype.getOrElse = function(key, value) { return this.has(key) ? this.get(key) : value} | |
Array.prototype.partition = function(filter) {return this.reduce(([passed, failed], el) => {(filter(el) ? passed : failed).push(el); return [passed, failed]}, [[],[]])} | |
Array.prototype.collapseAsEnv = function() { return this.reduce((res,scope) => Object.assign(res, scope), {})}; | |
Map.prototype.withDefaultValue = function(defaultValue) { | |
const result = new Map([...this.entries()]); | |
const getWas = result.get; result.get = (key) => | |
result.has(key) ? getWas.call(this, key) : defaultValue; | |
return result | |
} | |
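// minimal usage sketch: the default kicks in only for missing keys
// new Map([["a", 1]]).withDefaultValue(0).get("a") == 1 ; new Map().withDefaultValue(0).get("b") == 0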
merge = (f, ...trees) => { | |
const [head, ...tail] = trees | |
return (typeof head == "object") ? | |
head.zip(...tail).map(trees => merge(f, ...trees)) : f(...trees) | |
} ; merge((a,b) => a+b , [1,2,3], [11,22,33]) | |
RegExp.prototype.execAllGen = function*(input) { | |
for (let match; (match = this.exec(input)) !== null;) | |
yield [match, this.lastIndex]; | |
} ; RegExp.prototype.execAll = function(input) { | |
return [...this.execAllGen(input)]}; | |
//const matches = (/[a-z]+/g).execAll(" abc.xyz\n lkm") ; log("captured strings:", matches.map(m=>m[1])) | |
Array.prototype.flatten = function() {return this.reduce((acc, e) => [...acc, ...e],[])} | |
Array.prototype.flatMap = function(f) {return this.map(e => f(e)).flatten()}; | |
//my sucks http://cs.stackexchange.com/questions/65962/anatomy-of-representing-of-cartesian-product-in-sw | |
// http://eddmann.com/posts/cartesian-product-in-javascript/ | |
const cartesian = (...sets) => sets.reduce((acc, set) => | |
acc.flatMap(x => set.map(y => [ ...x, y ])), [[]]); | |
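// minimal usage sketch: cartesian([1,2], ["a","b"]) yields [[1,"a"],[1,"b"],[2,"a"],[2,"b"]]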
const percent = p => Math.round(p * 100) | |
Array.prototype.isAscending = function(strict) {const greater = (a,b) => strict ? a > b : a >= b; | |
return this.length == 0 ? true : | |
this.reduce(([result, max], a) => [result && greater(a, max), a], [true, this[0]-1])[0] | |
} ; | |
//[false, true].forEach(strict => log([1,2,3,3].isAscending(strict), [1,2,3].isAscending(strict), [1,3, 2].isAscending(strict))) | |
var Some = function(value) { this.value = value;}; | |
Some.prototype.get = function() {return this.value} | |
Some.prototype.map = function(f) {return new Some(f(this.value))} | |
Some.prototype.isEmpty = function() {return false} | |
var None = function() { }; | |
None.prototype.get = function() {throw "attempt to get value from None"} | |
None.prototype.map = function(f) {return this} | |
None.prototype.isEmpty = function() {return true} | |
Try = function() {return {// children will create their class instance | |
makeSuccess: function(value) {return new Success(value)}, | |
makeFailure: function(error) {return new Failure(error)}, | |
}} | |
success = (value) => new Success(value); | |
var Success = function(value) { | |
this.success = value;}; | |
Success.prototype = Object.assign(new Try(), { | |
isFailure: () => false, isSuccess: () => true, | |
map: function(f) {return this.makeSuccess(f(this.success));}, | |
flatMap: function(f) {return f(this.success);}, | |
getOrElse: function(value) {return this.success;}, | |
get: function() {return this.success;}, | |
recover: function(f) {return this}, | |
}) | |
fail = (error) => new Failure(error); | |
var Failure = function(error) { this.error = error} | |
Failure.prototype = Object.assign(new Try(), { | |
isSuccess: () => false, isFailure: () => true, | |
map: function(f) {return this}, | |
flatMap: function(f) {return this}, | |
getOrElse: function(value) {return value}, | |
getError: function() {return this.error}, | |
get: function() {throw this.error}, | |
recover: function(f) {return f(this.error)}, | |
makeSuccess: function(value) { | |
log("makeSuccess2", value) | |
return success(value)}, // children will create their class instance | |
}) | |
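// minimal usage sketch of the Try monad above:
// success(1).map(x => x + 1).getOrElse(0) == 2 ; fail("boom").map(x => x + 1).getOrElse(0) == 0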
</script>
!!!! I see that Erik Meijer discusses all three features that I wanted to address with my implementation
in one chapter, "White-space, comments, and keywords", of http://www.cs.nott.ac.uk/~pszgmh/monparsing.pdf
I came up with the idea that the oneOf | operator represents a plus + because BNF can be thought of as a
Sum of Products. But Erik points out that PEG is a limited version of a CFG parser, which admits
ambiguities and, when alternatives bring their results, you add the results together (all the different ways
to parse the same string), thus giving another reason to name it a plus operator.
!!!!!
// The basic idea of parser combinators is that A B | C D is identical to
A andThen B alternatively C andThen D.
And-Then is the default combinator. It binds stronger than alternatively, just like & is stronger than
| in the logic expression A & B | C & D and * is stronger than the arithmetical +. Now you see that the andThen and
alternatively combinators _appear between_ the parsers. Here A and B are references to other rules, and there
is a parser for every rule in a parser combinator: that rule is parsed with that parser. Higher-order
parsers may also appear, like opt(A), rep(A) or oneOf(parser1, parser2, parser3) -- the latter
is a viable alternative to the .alternatively combinator. They are higher-order functions (no pun intended)
rather than parser combinators, because parsers are functions that convert an input stream into a ParseResult,
which is a monad that encapsulates the parse result (a piece of AST produced by the parser from the stream)
and the rest of the stream after the parser has consumed an initial part of it. This allows us to declare the
andThen and alternatively methods in the monad. andThen will be invoked if the parse result was successful
and alternatively will be invoked if it was not. Yes, the ParseResult monad is an extension of the Try monad
with ParseSuccess and ParseFailure subclasses.
ParseFailure's andThen method does nothing -- it returns the error it contains, without
attempting to parse anything. Its alternatively method is alive though: it takes the input from the place
of failure and applies the argument parser, C in our example, to it. This allows a recovery and a successful
parse with an alternative sequence of rules. ParseSuccess is the opposite: its andThen takes the partially
consumed stream it contains, applies the parser and, in the case of success, when the second parser produces
a result rather than failing, it creates the pair [this.result, next.result]. The result can be updated in the ParseSuccess
monad with the standard map/flatMap methods. However, because ParseSuccess extends Success using
[astValue, restOfTheStream] as its success value, and we want to update the astValue produced by the parser
while leaving the rest of the input stream intact, a .userMap(astValue => ...user value update code...) method
is provided for convenience.
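A minimal sketch of that shape (simplified stand-ins, not the gist's actual ParseSuccess/ParseFailure classes):
ok  = (ast, rest) => ({ ok: true, ast, rest,
  andThen:       p => { const r = p(rest); return r.ok ? ok([ast, r.ast], r.rest) : r },
  alternatively: p => ok(ast, rest) })                  // already succeeded -- keep the result
err = (msg, rest) => ({ ok: false, msg, rest,
  andThen:       p => err(msg, rest),                   // failure propagates, nothing more is parsed
  alternatively: p => p(rest) })                        // failure recovers by trying the other parser
lit = s => input => input.startsWith(s) ? ok(s, input.slice(s.length)) : err("expected " + s, input)
lit("a")("ab").andThen(lit("b"))                        // ok with ast ["a","b"] and empty rest
lit("x")("ab").alternatively(lit("a"))                  // recovers and parses "a"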
All of the text above saying `A andThen B alternatively C andThen D` is technically incorrect. In fact,
the parser is
input => A(input) andThen B alternatively C andThen D
It is a function that produces the A-monad once given the input stream, which is where the combinators
trigger and bind the rest of the computations. It is however quite annoying to have the pattern
input => p1(input)...
all over the code. I should probably make the combinators methods of the Parser function
and forget about ParseResult, or duplicate the monadic methods in both ParseResult and the parser.
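A minimal sketch of that alternative (a hypothetical asParser helper; it assumes the ParseResult andThen/orElse methods defined in the parser file below):
asParser = f => Object.assign(f, {
  andThen:       g => asParser(input => f(input).andThen(g)),
  alternatively: g => asParser(input => f(input).orElse(g(input)))
})
// A.andThen(B).alternatively(C.andThen(D)) would then itself be a parser -- no input => ... wrappers needed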
// pass and fail: whenever we return a success, we eat something. Higher-
order combinators do not eat the stream characters directly. They do not create the primary results --
they combine the lower-order results.
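For instance (simplified from the definitions in the parser file below), literal eats characters by calling input.pass with a consumed length, while opt never touches the stream itself -- it only wraps the result of the parser it combines:
literal = s => input => input.rest(s.length) == s ? input.pass(s, s.length) : input.fail("expected " + s)
opt = p => input => p(input).userMap(r => new Some(r)).orElse(input.pass(new None(), 0))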
// monadic parse results. Done:
// 1. realized that the top rule must be additive rather than multiplicative,
// 3. Whitespace stripping between the rules. Whenever a stream is created (and it is created whenever chars are consumed, rather than being mutated), we strip the leading spaces.
// 4. optional &-sign skipping (as in math and BNF: a sequence of nodes on the right-hand side corresponds to the & operation whereas the alternative | is the or-operation) and rule references with arguments
// 5. parenthesized sub-alternatives. This corresponds to the fact that we need parentheses
// if we want addition within a group to bind stronger than the surrounding AND-s
// TODO:
// 1. Done: literal calls to parametrized rules if a parenthesized expression is prefixed with a rule identifier (without the & symbol).
// 2. Done: Simplify semantic actions. Currently, we have to map over [result, input] but parser users need to map only the result projection in most cases.
// 3. Parser generation (with keywords)
// 4. Random trees
// It is (going to be) invoked to build both parser generators and random tree generators
//grammar = parse(grammar); | |
//grammar.recognizer().parse(input) | |
//grammar.generator().random() | |
Yes, it makes sense to do this in javascript because tree generation is fast and we can send the programs for evaluation
to Erlang, which is parallel and provides good isolation for processes.
// In fact, "a b c d" stands for "a & b & c & d", which stands for "a & (b & (c & (d)))" because it is
// the same as a.andThen(b.andThen(c.andThen(d))). But, for associative operations, we prefer the first form.
// With the bothOrFirst operator, I can write and parse associative expressions like
// bothOrFirst(multiplicative, "|".overrideWith(multiplicative) ) which gives me the list of addition operands
// in the plain form (+ a b c d ...) instead of the ugly nest a+(b+(c+(d+...))). Moreover, this eliminates the need
// for the late bindings necessary in "or = and + or | and" -- or is no longer defined through or.
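// A hypothetical helper, only to illustrate the shape change (the real code flattens at parse time instead):
// flattenPlus = node => Array.isArray(node) && node[0] == "+"
//   ? ["+", ...node.slice(1).flatMap(c => {const f = flattenPlus(c); return Array.isArray(f) && f[0] == "+" ? f.slice(1) : [f]})]
//   : node
// flattenPlus(["+", "a", ["+", "b", ["+", "c", "d"]]]) gives ["+", "a", "b", "c", "d"]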
// if skip is on, the input stream will use our parser and discard the results
// We won't use oneOf(...whitespaces.toArray().map(literal)) to allow using the parser
// without dependencies and for performance reasons.
/*
a big coding disadvantage of my monadic approach is that I must type the input => parser(input) pattern
input => firstParser(input).andThen(combinators).overrideWith(thatIWant)
everywhere. I was also too lazy to implement andThen in the ParseResult monad, which ought to be an
extension of the Try monad, and to place it then into Object.prototype. An alternative could be to attach
the monadic methods to the parsers. Parser.andThen must initialize the parser's 'next' field such that
when it gets a result, it performs the next action, passing its result as an argument.
*/
// DONE: stringification and even some parsification after parentheses were made mandatory to create
// label scopes. TODO: replace parentheses with mandatory + nodes.
// TODO: stream(string) into string.toStream() since parser("abc".toStream()) is easier to read than parser(stream("abc"))
// Moreover, do "abc".parse(parser) instead of parser(stream("abc"))
// TODO: eliminate parenthesis nodes -- replace them with a mandatory "+". The parentheses are mandatory now
// anyway for label scoping. Translation will start with the {} in the stack, which corresponds to the
// default +/& scope, and every + will create its subscope stack frame as () did previously.
// Let's use () nodes for referenced rule arguments since we do not need two kinds of parentheses anymore!
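// A minimal sketch of those convenience wrappers (hypothetical, not used below; parseWith is a made-up name
// to avoid clashing with the String.prototype.parse defined at the end of this file):
// String.prototype.toStream = function(options) {return stream(this.toString(), options)}
// String.prototype.parseWith = function(parser) {return parser(this.toStream())}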
tracejs = (msg, data) => trace(data, data => [msg, data.json()]) | |
logRem = (msg, input) => log(msg + " from", input.rest()) | |
plog = (string, parser) => {let n = parser.name; if (n == "") n = "anonym parser" | |
tracejs(n /*+ "(" + string + ") = "*/, parser(stream(string)))} | |
// I could extend Object.prototype with ParseResult methods like .andThen but
// let's pay the coding price and the performance penalty of extending only the ParseResults
function ParseError(error) { | |
Object.assign(this, Failure.prototype); Failure.call(this, error) | |
this.makeSuccess = function(value) {return new ParseSuccess(value)} //teach Success to make ParseResults | |
}; | |
function ParseSuccess(result) { | |
Object.assign(this, Success.prototype); Success.call(this, result); | |
this.makeSuccess = function(value) {return new ParseSuccess(value)} // set our constructor for Success | |
} | |
const ParseResult = ParseError.prototype = ParseSuccess.prototype = {};// monadic methds to be added | |
//success = result => new ParseSuccess(result) ; fail = (error) => new ParseError(error) | |
consumeWhile = (condition) => input => {let offset = 0; for (; offset < input.len() && | |
condition(input.charAt(offset)) ; offset++) ; return input.pass(input.rest(offset), offset)}; | |
consumeFollowingChars = (chars) => consumeWhile(char => chars.indexOf(char) != -1) | |
skip20TabCrNl = consumeFollowingChars(" \t\r\n") | |
stream = (string, options) => { | |
const result = Object.assign({source: string, offset:0, skip:skip20TabCrNl, | |
peek: function() {return string.charAt(this.offset)}, | |
charAt: function(at) {return string[this.offset + at]}, | |
fail: function(msg) {return new ParseError({at: this.offset, msg: msg})}, | |
len: function() {return string.length - this.offset}, | |
rest: function(len) {return len == undefined | |
? string.substring(this.offset) | |
: string.substring(this.offset, len + this.offset)}, | |
clone: function(offset, options) { | |
return stream(string, Object.assign({ | |
offset:this.offset+offset, skip:this.skip}, options))}, | |
eat: function(nChars) {return nChars == 0 ? this : this.clone(nChars)}, | |
pass: function(result, consumed) {return new ParseSuccess([result, this.eat(consumed)])} | |
}, options); | |
return result.skip(result).get()[1] | |
} | |
checkParseResult = (inversion, input, parser, expected) => { | |
let [name, parseResult] = parser.isSuccess != undefined ? | |
[input, parser] : [parser.name, parser((typeof input == "string") ? stream(input) : input)] | |
parseResult = parseResult.flatMap(([presult,rem]) => | |
expected != undefined && expected.ast != undefined && presult.json() != expected.ast.json() | |
? fail("parsed result "+presult.json() +" whereas "+expected.ast.json() + " expected") | |
: success([presult, rem])).flatMap(([pres,rem]) => | |
expected != undefined && expected.rem != undefined && expected.rem != rem.len() | |
? fail("There are " + rem.len() + " chars remains in the stream wheras we wanted " + expected.rem) | |
: success([pres, rem]) ) | |
//const spec = parser //[parser.name, "(\"" + input + "\")"].json(); | |
const result = inversion(name, parseResult) | |
if (result.isFailure()) console.error(result.getError()) ; return result | |
} ; | |
mustFail = (...checkArgs) => { return checkParseResult((parser, checked) => checked.isFailure() | |
? success([parser, "detected user input error", checked.getError().json(), "as expected"]) | |
: fail(parser + " had to fail, yet, produced Success(" + checked.get().json() + ")") | |
, ...checkArgs); | |
} | |
mustPass = (...checkArgs) => { return checkParseResult((parser, checked) => checked.isSuccess() | |
? success({parser:parser, parseResult:checked.get().json()}) | |
: fail({parser:parser, err:checked.getError().json()}), ...checkArgs); | |
} | |
mustPass(" \ta", input => input.pass(input.peek(), 1), {ast: "a"}) // ensure that sw stripped and a consumed | |
mustFail(" \ta", input => input.fail("whatever")) // input and parser are send to checker | |
mustPass("", stream("abc").pass(1, 0), {ast:1}) // parser name and parse result are send to checker | |
// show that we consume whitespaces from as stream that does not do that for us | |
keepWhitespaces = (string) => stream(string, {skip:consumeFollowingChars("")}) | |
successParser = (result) => input => input.pass(result, 0) | |
mustPass(keepWhitespaces(" \ta"), successParser(), {rem:3}) // check that nothing was consumed | |
mustPass(keepWhitespaces(" \ta"), consumeFollowingChars(" \t\n\r"), {rem:1}) // a is left in stream | |
if (false) {const abc = stream("abc"); const bc = [abc, abc.eat(1)] | |
log(bc.map(s => [s.rest(), s.peek()]))} | |
named = (name, f) => update(f, f => {Object | |
.defineProperty(f, "name", {writable: true}); f.name = name}) | |
literal = literal => named("literal(" + literal + ")", (input) => { | |
const match = input.rest(literal.length); | |
return match == literal ? input.pass(match, literal.length) | |
: input.fail("expected "+literal + " whereas got " + match) | |
}); | |
mustPass("abc", literal("ab")) ; mustFail("aab", literal("ab")) | |
weUseRe = re => named(re, input => {const match = input.rest().match(re); | |
return match != null && match.index == 0 | |
? input.pass(match[0], match[0].length) | |
: input.fail("expected pattern " + re + " whereas got " + input.rest(20)) }); | |
mustPass("abc", weUseRe(/ab/), {rem:1}) | |
ParseResult.userMap = function(f) {return this.map(([res, input]) => [f(res), input])} | |
ParseResult.userFlatMap = function(f) {return this.flatMap(([res, input]) => f(res))} | |
oneOf = (...parsers) => { | |
const specializedName = "oneOf " + parsers.map(p => p.name) | |
return named(specializedName, input => parsers.reduce((failed, parser) => | |
failed.recover(_ => parser(input)), input.fail("oneOf seed with alternatives " + specializedName)) | |
.recover(_ => input.fail(specializedName + " expected whereas got " + input.rest(20)))) | |
} | |
ParseResult.andThen = function(...parsers) { return this.flatMap(([myResult, input]) => | |
oneOf(...parsers)(input).userMap(hisResult => [myResult, hisResult]))} | |
ParseResult.andThen = function(another) { return this.flatMap(([myResult, input]) => | |
another(input).userMap(hisResult => [myResult, hisResult]))} | |
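// note: this single-argument andThen overrides the variadic definition just above (the later assignment wins)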
mustPass("a & ab1", input => literal("a")(input).andThen(literal("&")).andThen(oneOf(literal("c"), literal("a")))) | |
ParseResult.orElse = function(thatResult) {return this.recover(_ => thatResult)} | |
mustPass("a&ab", i => literal("b")(i).orElse(successParser("alt")(i)), {ast:"alt"}) | |
ParseResult.overrideWith = function(parser) {return this.flatMap(([_, input]) => parser(input))} | |
overrideP = (skipedParser, replacement) => input => skipedParser(input).overrideWith(replacement) | |
mustPass("&c", input => literal("&")(input).overrideWith(literal("c")), {ast:"c"}) | |
mustPass("&c", overrideP(literal("&"), literal("c")), {ast:"c"}) | |
ParseResult.skipNext = function(parser) {return this.flatMap(([my, input]) => parser(input).userMap(_ => my))} | |
enclosed = (...args) => { // args either (left, content, right) or ("lr", content) where lr is something like [] | |
const [lp, content, rp] = args.length == 2 ? [args[0][0], args[1], args[0][1]] : args | |
return named(`${lp}${content.name}${rp}`, input => literal(lp)(input).overrideWith(content) | |
.skipNext(literal(rp)).userMap(content => /*({class: lp+rp, value:content})*/[lp+rp, content])) | |
} | |
pared = content => enclosed("()", content) ; mustPass("(abc)", pared(literal("abc"))) | |
// an undefined spec captures any literal, a specified one captures that specific literal
const QQ = "\"\""; | |
stringLiteral = (spec) => enclosed(QQ, spec == undefined ? weUseRe(/[^"]*/) : literal(spec)) | |
mustPass('"abc"', stringLiteral("abc"), {ast:[QQ, "abc"]}); mustPass('"abc"', stringLiteral()); | |
identifierParser = weUseRe(/[a-zA-Z][a-zA-Z0-9]*/) | |
theyUseRe = enclosed("//", weUseRe("[^/]+")) // plog("/ab[c]+/", theyUseRe) | |
ujsBrackets = new Map([["}", -1], ["{", +1]]).withDefaultValue(0) | |
userJsAction = enclosed("{}", input => (() => { | |
const loop = (eat, deficit) => deficit == 0 ? input.pass(input.rest(eat-1), eat-1) : | |
eat == input.len() ? input.fail("could not find java end") : | |
loop(eat + 1, deficit + ujsBrackets.get(input.charAt(eat))) | |
return loop(0, 1)})()) | |
lb = name => named("lb_" + name, (...args) => eval(name)(...args)) // late binding | |
bothOrFirst = (a, b, map) => named(a.name + " or " + a.name + " ~ " + b.name, | |
input => {const A = a(input); return A.andThen(b).userMap(ab => map(...ab)).orElse(A)}) | |
//const args = bothOrFirst(lb("or"), input => literal(",")(input).overrideWith(args), | |
//(a,b) => [",", a].concat(b[0] == "," ? b.slice(1) : b)) | |
const args = bothOrFirst(input => or(input).userMap(arg => [arg]), | |
input => literal(",")(input).overrideWith(args), (a,b) => [...a,...b]) | |
primary = named("primary", | |
oneOf(stringLiteral(), | |
bothOrFirst(identifierParser, pared(args), (rule,[par,args]) => ["()", rule, ...args]), | |
theyUseRe, userJsAction | |
, input => pared(or)(input).userMap(([par, terms])=> terms) )) | |
mustPass("abc", primary) | |
ParseResult.binMap = function(f) {return this.userMap(([a,b]) => f(a,b))} | |
bin = (name, binMap, a, ...b) => named(name, input => { const seed = a(input) | |
return seed.andThen(...b).binMap(binMap).orElse(seed)}) | |
// Actually, I could define bin "withOption" combinator, like | |
ParseResult.withOption = function(...parsers) { | |
return this.andThen(...parsers).orElse(this)} | |
unfortunateBin = (op, a, ...b) => input => a(input).withOption(...b) | |
.binMap(binSignMap(op)) // here is the problem | |
// But, I would not be able to map the seed-option paris as I did in the bin in this case | |
opt = (bodyP) => named("optional " + bodyP.name, input => (bodyP)(input) | |
.userMap(result => new Some(result)).orElse(successParser(new None())(input))) | |
mustPass("11", opt(literal("1")), {rem: 1}); mustPass("11", opt(literal("2")), {rem: 2}) | |
labelledPrimary = named("labeledPrimary", | |
input => opt(input => identifierParser(input).skipNext(literal(":")))(input).andThen(primary) | |
.userMap(([label, primary]) => label.isEmpty() ? primary : [":", label.get(), primary])) | |
mustPass("aa", labelledPrimary, {ast:"aa"}); | |
mustPass("a2:aa", labelledPrimary, {ast:[":","a2","aa"]}); | |
//mustPass("a:(aa)", labelledPrimary, {ast:[":","a",["()", "aa"]]}); // or is not defined for that | |
binSignMap = (sign) => (a,b) => "(" + a + sign + b + ")" | |
// after &, parentheses are removed in the AST to make a difference with function calls
// This has the nice side effect that a & (b c) is converted into &(a,b,c) but
// the downside is that al:a & (bl:b) is converted into al:a & bl:b and we get a label scope mixing error
parAfterAmpersand = input => overrideP(literal("&"), labelledPrimary)(input) | |
.userMap(res => res[0] == "()"? res[1]: res) | |
// Our second version of and/or rules. Here, we flatten the nest of & binary subexpressions | |
// because we do not have the rep | |
and = bin("and", | |
// binSignMap("*") | |
//(a,b) => "(" + a + "&" + b + ")" | |
//(a,b) => a + "&" + b | |
//(a,b) => ({class:"&", a:a, b:b}) | |
//(a,b) => ["&", a, b] | |
(...args) => args.reduce((acc, a) => a[0] == "&" ? [...acc, ...a.slice(1)] : [...acc, a], ["&"]) | |
//(...args) => and(...args) below is a late binding! | |
, primary, | |
lb("and"), parAfterAmpersand) | |
or = bin("or", | |
//binSignMap("+") | |
(...args) => args.reduce((acc, a) => a[0] == "+" ? [...acc, ...a.slice(1)] : [...acc, a], ["+"]) | |
, and | |
, overrideP(literal("|"), lb("or"))) | |
if (false) plog("x y z (k l | m t | p | f) a b c)", and) | |
// after &, parenthesis are removed in AST to make difference with function calls | |
if (false) plog("(a| m) a c & (a | l | t) | a c & d (a r g) (r | m) ", or) | |
flatTail = ([a,b]) => [a, ...b] | |
//Kleene star, rep(a) = a ~ rep(a) | ε
rep = (bodyParser) => named("rep of " + bodyParser.name, input => oneOf( | |
input => (bodyParser)(input).andThen(rep(bodyParser)) | |
.userMap(flatTail), successParser([]))(input)) // there will be a failure in the end and we cure it with empty array | |
//Kleene plus, rep1(a) = a ~ rep(a)
rep1 = (bodyParser) => named("rep1 of " + bodyParser.name, | |
input => bodyParser(input).andThen(rep(bodyParser)).userMap(flatTail)) | |
const [la,comma] = ["a", ","].map(literal) | |
;["aaa", ""].map(input => mustPass(input, rep(la))); | |
mustPass("aa", rep1(la)); mustFail("baa", rep1(la)); | |
// rep1sep(a, sep) = a ~ rep(sep ~> a) | |
// repsep(a, sep) = rep1(sep ~> a) | ε | |
rep1sep = (item, sep) => named("rep1_" + item.name + "_sep_" + sep.name,
bothOrFirst(item, rep(input => sep(input).overrideWith(item)), (a,b)=> [a, ...b]))
repsep = (item, sep) => named("rep_" + item.name + "_sep_" + sep.name,
oneOf(rep1sep(item, sep), successParser([])))
if(true) ["a,a,a", ""].map(input => mustPass(input, repsep(la, comma)) ); | |
mustPass("a,a,a", rep1sep(la, comma)); mustFail("", rep1sep(la, comma)); | |
// 3rd version of and/or parsers which flattens the binary nest automatically, by virtue of rep parser | |
//and = botOrFirst(labelledPrimary, rep1(labelledPrimary, parAfterAmpersand)), (a,b) => ["&", a, ...b]) | |
//and = input => rep1sep(labelledPrimary, opt(literal("&")))(input).userMap(items => ["&", ...items]) // "b" => [&,b] | |
and = named("and", input => {const anded = bothOrFirst( | |
labelledPrimary, rep1(input => opt(literal("&"))(input).overrideWith(labelledPrimary)), | |
(pr,rep) => ["&", pr, ...rep])(input) | |
return anded.userMap(literals => {//const strip = literals[0] == ":"; | |
// if(strip) console.warn("label", literals[1], "makes no sense -- it will be stripped from ast"); | |
// return strip ? literals[2] : literals})}) | |
return literals[0] == ":" ? ['&', literals] : literals})}) | |
or = named("or", bothOrFirst(and, rep1(overrideP(literal("|"), and)), (seed,tail) => ["+", seed, ...tail])) | |
//plog("a (c) b1:b & (c d) (kk:k|l)", or) | |
// check the parse pass and fail themselves | |
if (false) { mustPass("aa", primary, {ast:"aa", rem:0}).json() | |
log(1, [mustFail("&", primary), mustFail("a", primary), | |
mustFail("a", primary, ["ab",2])].map(r => r.json() + "\n")); mustPass("a", primary).json()} | |
testReport = (testResults) => {const failed = testResults.filter(tr => tr.isFailure()); | |
const successCount = testResults.length - failed.length | |
if (successCount == testResults.length) log("all " + testResults.length + " tests succeeded"); | |
else log("only", successCount, "tests succeded whereas", failed.length, "failed"); | |
} ; testReport([mustFail("&", primary), mustPass("a", primary), mustPass("a", primary, {ast:"a", rem:0}), | |
mustPass("x (r)", or, {ast:["()","x","r"]}), mustPass("x (y (k,l))", or, {ast:["()","x",["()", "y", "k", "l"]]}), | |
mustPass("(/a/| m) c | a1 c & dd:d (mm:m | l n) \"a\" bb:(b b)", or, {ast:["+",["&",["+",["//", "a"],"m"],"c"], | |
["&","a1","c",[":","dd", | |
["()","d",["+",["&", [":", "mm", "m"]],["&","l","n"]]]], | |
[QQ, "a"],[":","bb",["&","b","b"]]]], rem:0})]) | |
eos = input => input.len() == 0 ? input.pass("eos", 0) : input.fail("eos expected but " + input.rest(10) + " found") | |
ParseResult.finished = function() {return this.skipNext(eos)} | |
mustPass("", eos) ; mustFail("a", eos) ; mustPass("a", input => (literal("a")(input)).finished()) | |
// The high-level architecture
// We parse the grammar. It consists of rules like
// name (optional args) = rhs, separated by \n. Our parsifier creates a parser from the rhs AST.
// This parser may refer to other rules/parsers. When this parser is asked to parse user input,
// it will pass the other parsers as arguments and call the referenced parser.
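// For example (a hypothetical rule following the format above and the working actions used in generate() below,
// which take {a+b}-style expressions rather than return statements):
//   sum = a:int b:int { a + b }
// Once generate() has parsified such rules, they can be invoked as in "1 2".parse("sum") near the end of this file.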
grammar = ` | |
// the math: a b c d + x y z = +(·(a,·(b,·(c,d))), ·(x,·(y,z)))
// we can type y = a b c | k l m, abbreviating the fully-qualified syntax y = a & b & c | k & l & m
// but we cannot omit the & in the case of parentheses, y = a & (b | c), since a(b | c) would call rule a with arguments
start | |
= additive | |
additive | |
= left:multiplicative "+" right:additive { return left + right; } | |
| multiplicative | |
multiplicative | |
= left:primary "*" right:multiplicative { return left * right; } | |
| primary | |
primary | |
= integer | |
| "(" additive:additive ")" { return additive; } | |
integer | |
= digits:/[0-9]+/ { return parseInt(digits.join(""), 10); }` | |
comments = input => skip20TabCrNl(input).overrideWith(input => | |
(input.len() < 2 || input.charAt(0) != '/' || input.charAt(1) != '/') ? | |
input.pass("", 0) : consumeWhile(c => c != '\n')(input).andThen(comments) | |
) | |
mustPass("// this is a comment\n aaa", comments, {rem:3, ast:["// this is a comment", ""]}) | |
mustPass(stream(" ///behind two \n // lines 2 \n x", {skip:comments}), literal("x")) | |
mustPass(stream(" ///behind two \n // lines 2 \n ", {skip:comments}), eos) | |
stripLeadingComments = (string) => comments(stream(string/*, {skip:comments}*/)) | |
mustPass("stripping comments", stripLeadingComments( // we an get comments if they are not skipped | |
" //1st \n //2nd line "), {rem:0, ast:["//1st ",["//2nd line ", ""]]}) | |
ruleParser = input => identifierParser(input).andThen(opt(pared(rep1sep(identifierParser, literal(","))))).skipNext(literal("=")).andThen(or).finished() | |
mustPass(stream("a=b // c", {skip:comments}), ruleParser) | |
// todo: we do not need scopeChain in the stringify. It is for parsify where we will execute javascript | |
const stringify = (rhs) => { | |
const stringifyAComposite = () => { const [tag, ...args] = rhs; | |
const join = separator => args.map(stringify).join(separator) | |
return ["{}", QQ, "//"].indexOf(tag) != -1 ? tag[0] + args[0] + tag[1] : { | |
"()": () => args[0] + "(" + stringify(args[1]) + ")", | |
":": () => args[0] + ":" + stringify(args[1]), | |
"+": () => join("|"), "&": () => join("&") , | |
}[tag]()} ; return typeof rhs == "string" ? rhs : stringifyAComposite() | |
} | |
a = "la: aa lb:(lc:{js1}) f(xx:x|y) {js2}"; plog(a, or); log(stringify(or(stream(a)).get()[0], [{}])) | |
// ToDo: in addition to stringify, make rnd and parsify functions. We could use random | |
// trees stringified to test our generated parsers. We will need randexp! | |
backToTextCheck = (right) => { const unparsedRule = "integer = " + right | |
const parsed = stripLeadingComments(unparsedRule).overrideWith(ruleParser) | |
const noWS = str => str .replace(/\s*/mg, "") | |
mustPass(unparsedRule + " stringification test", parsed.userMap( | |
([left, right]) => left + " = " + stringify(right)).userMap(noWS) | |
.flatMap(([stringified, input]) => stringified == noWS(unparsedRule) ? | |
input.pass(stringified,0): input.fail(stringified + " != "+ noWS(unparsedRule))) ) | |
} | |
;['b:a', 'b:/a/ & "c" ', 'k & l', 'k|l', 'f(x) | l1:g (l2:/y/ & "z") & {js1}'].forEach(backToTextCheck) | |
const parseRhs = (grammar) => stripLeadingComments(grammar).overrideWith(or).finished() | |
const rhsCheck = (kind, grammar, input) => kind("applying grammar " + grammar + " to " + input, | |
parseRhs(grammar).userFlatMap((ast) => parsify(ast, new Set())(stream(input)))) | |
const rhsPass = (...args) => rhsCheck(mustPass, ...args) | |
const rhsFail = (...args) => rhsCheck(mustFail, ...args) | |
//rhsFail(`"a" "a" "c"`, "aab") | |
//defined = []; | |
pInfo = {}, l2r = [{}], used = new Set(); env = [{}] | |
// Stringify, parsify or both https://softwareengineering.stackexchange.com/questions/336243 | |
parsify = (rhs) => { | |
if (typeof rhs == "string") rhs = ["()", rhs]; | |
const [tag, ...args] = rhs; | |
const join = separator => args.map(parsify).map(p => p.name).join(separator) | |
return { | |
[QQ]: () => named('"' +args[0] + '"', literal(args[0])), | |
"//": () => named('/' +args[0] + '/', weUseRe(args[0])), | |
"{}": () => named('{' +args[0] + '}', input => {const myEnv = env.collapseAsEnv() | |
return input.pass(eval("with (myEnv)" + args[0]), 0)}), | |
":": () => {const parser = parsify(args[1]); return named (args[0]+":"+parser.name, | |
input => parser(input).userMap(res => env.last()[args[0]] = res))}, | |
"()": () => {const f = args.shift(); used.add(f); | |
const pa = args.map(parsify) ; | |
return named(f + "("+pa.map(a => a.name).join(",")+")", | |
//defined[f](...pa) // immediate is easy and fast but won't handle recursions | |
//input => defined[f](...pa)(input) // so, resolve refs at parser use time. | |
input => {; const spec = f + "("+args.join(",") + ")" | |
const ref = l2r.collapseAsEnv()[f] | |
log("defined", spec, l2r.collapseAsEnv()) | |
if (ref == undefined) throw spec +" is undefined" | |
const namedArgs = pInfo[f].reduce((res, chName, i) => // pos to named | |
Object.assign(res, {[chName]: pa[i]}), {}) | |
log("named args", namedArgs) | |
return l2r.withPush(namedArgs, () => ref(input)) | |
return ref(input)} | |
)}, | |
"+": () => named(join("|"), oneOf(...args.map(parsify))), | |
"&": () => {const children = args.map(parsify) | |
// Here is the issue | |
// https://softwareengineering.stackexchange.com/questions/344513/semantic-actioin-in-parser-combinators | |
return input => env.withPush({}, () => { | |
return children.reduce((acc, p, i) => args[i][0] == "{}" | |
? acc.overrideWith(p).userMap(r => /*[r]*/r) | |
: acc.flatMap(([acc,input]) => p(input).userMap(r => [...acc, r])), input.pass([], 0)) | |
}) | |
} | |
}[tag]() | |
} | |
p1 = parsify(or(stream('a:"a" b:"b"& c:(k: "k" "l")')).get()[0], [{}]) ; mustPass("a b k", p1) | |
log(p1.name) | |
if (false) {a = '"a" & "b" & "b2" | /c+/'; plog(a, or); | |
ap = parsify(or(stream(a)).get()[0], [{}]) | |
log(ap.name); ["abb2", "a bb2", "c"].forEach(input => mustPass(input, ap)) ; mustFail("abb 2", ap);} | |
/*pInfo["rep1sep"].parsified = rep1sep | |
rep1sepAComma = or(stream('rep1sep("a", ",")')).get()[0] ; log("ast:", rep1sepAComma); | |
rep1sepAComma = parsify(rep1sepAComma); mustPass("a,a,a", rep1sepAComma) | |
defined["int"] = () => int // convert simple parser into zero-arg parser | |
int = or(stream("/[0-9]+/")).get()[0] ; log(int); | |
int = parsify(int); log(int(stream("001")).get()[0]); | |
intRef = or(stream("int()")).get()[0] ; intRef = parsify(intRef); | |
log("used", used, used.size, Object.keys(defined).length) | |
log(intRef.name, intRef(stream("002")).get()[0]); | |
int = or(stream("a:/[0-9]+/ {parseInt(a)}")).get()[0] ; log(int); | |
int = parsify(int); log(int(stream("001")).get()[0]); | |
jp = parsify(or(stream('a:int b:int {console.log("Hello from grammar! Env="+JSON.stringify(env)); 1}')).get()[0], [{}]) ; log(jp(stream("12 3")).get()[0]) | |
js = 'a:int b:int {a+b}'; plog(js, or); jp = parsify(or(stream(js)).get()[0]); | |
log("a+b=",jp(stream("1 2")).get()[0]) | |
*/ | |
//log(parsify(parseRhs(`"aa" {"jj"} "bb" "cc"`).get()[0])(stream("aa bb cc")).get()[0].json()) | |
generate = (grammar) => { | |
// separate grammar by \n rule_identifier = ... separator. We will parse every rule separately | |
const offsets = /\n\s*[a-zA-Z][a-zA-Z0-9]*\s*(\(.*\))?\s*=/g.execAll(grammar).map(([match, at]) => match.index) | |
const unparsedRules = [...offsets, grammar.length].reduce( | |
([rules, start], end) => [[...rules, grammar.substring(start, end)], end+1] | |
, [[], 0])[0] | |
.map(stripLeadingComments) // convert into an input stream, stripping leading comments
.filter(result => result.get()[1].len() != 0) // ignore empty (pure comment) rules | |
const parsedRules = unparsedRules.map(parseResult => parseResult.overrideWith(ruleParser)) | |
const [succeeded, failed] = parsedRules .partition(res => res.isSuccess()) | |
log("succeeded",succeeded.length,"rules of",parsedRules.length,":") | |
if (parsedRules.length != succeeded.length) | |
console.error(parsedRules.length - succeeded.length, "rules failed:"); | |
else {succeeded.map(res => res.get()[0]).forEach(([[name, args], rhs]) => { | |
log(name,args.isEmpty() ? "" : '(' + args.get()[1] + ')', "=", rhs.json()) | |
l2r[0][name] = parsify(rhs) | |
pInfo[name] = args.isEmpty() ? [] : args.get()[1] | |
}) | |
} ; failed.forEach((res, i) => log(i+1, res.json())) | |
} | |
//generate('integer = digits:/[0-9]+/ { return parseInt(digits);} | "abc" ') | |
generate('int = a:/[0-9]+/ { parseInt(a)} \n sum = a:int b:int {a+b} \n double(a) = a a \n ' | |
+'andThen(a, b)=a b \n x = "x" \n intRef = int \n intRef2 = intRef \n' | |
+ 'double2(c) = double(c)') | |
log(pInfo, pInfo.sum) | |
String.prototype.parse = function (pname, ...parsifiedChildren) { | |
const namedArgs = pInfo[pname].reduce((res, chName, i) => // pos to named | |
Object.assign(res, {[chName]: parsifiedChildren[i]}), {}) | |
l2r.withPush(namedArgs, () => plog(this, l2r[0][pname])) | |
} | |
"13 04".parse("double2", l2r.collapseAsEnv()["int"]) // the problem | |
//https://softwareengineering.stackexchange.com/questions/344920/evaluation-of-calls-with-parameters | |
throw 1 | |
"a a".parse("double", literal("a")) | |
"a b".parse("andThen", ...['a', 'b'].map(literal)) | |
"10".parse("int"); "11".parse("intRef"); "12".parse("intRef2"); | |
"01 02".parse("double", l2r.collapseAsEnv()["int"]) | |
// the problem here is that literal argument is ignored and [(), a] parsers | |
// will refer undefined | |
//plog('"a" "a"', defined['double'](literal("a"))) | |
//generate('integer = (p r) a "b" /[0-9]+/ | {a = b} ') | |
//generate('integer = l1:x l2:(inner: a {js1}) {js2}') | |
//generate('integer = f(x y) g & (k l)') | |
//generate('integer = f(x) | g (/y/ "z") ') | |
//generate('integer = l1:(ll1:a) l2:y self:(inner: a {js_inner}) {js_outer}') | |
//generate(grammar) |
The key point of the first version is in the line
parAfterAmpersand = input => overrideP(literal("&"), labelledPrimary)(input)
.userMap(res => res[0] == "()"? res[1]: res)
used in the and-parser
and = bothOrFirst(labelledPrimary, rep1(oneOf(labelledPrimary, parAfterAmpersand)), (a,b) => ["&", a, ...b])
It says that we must override & symbols with what they concatenate, so that the text "a & b & c" becomes [&,a,b,c] in
the parse tree, because ampersands stand for the default 'andThen' combinators and the text "a & b & c" is equivalent to
"a b c". You do not usually write "and then" in BNF; you assume it if there is a space between child rule
identifiers/references.
The userMap gets rid of the parentheses so that "a & (b | c)" and "a & (c)" are parsed into [&, a, [+, b, c]] and
[&, a, c] instead of the bulky [&, a, [(), [+, b, c]]] and [&, a, [(), c]]. But the reason is not just the elimination
of unnecessary parentheses. Because we skip the &s in our sequence, we lose the difference between "a & (c)", which
stands for "a and then c", and "a(c)", which stands for "call a with argument c". In order to preserve the semantics
while simplifying the expressions, I removed the unnecessary parentheses after the &. They become unnecessary in the
parsed AST because there is a single child node within the parentheses and we can refer to it directly from the parent
and-then chain.
This however became problematic in the presence of labels. Labels have a scope within an and-then sequence. That is,
the inner javascript may see all of the a, b and c labels but the outer javascript should see only the a and b labels
al:a & bl:b & (cl:c {inner javascript}) {outer javascript}
If however we remove the parentheses
al:a & bl:b & cl:c {inner javascript} {outer javascript}
we get all the labels in one scope.
There is another problem. Even if labels did not create this issue and there were a way to distinguish argument parentheses
from the anded parentheses, the translator would still need to look at what comes next behind every identifier in order
to couple them into a single call.
So, one way to solve the label problem would be to evaluate the parentheses at parse time.
Another solution would be to leave the parentheses after the eliminated & and change the "rule_id (args)"
call parser: instead of
rep1(oneOf(primary, optionalAndWithParenthesis) )
we could leave the and = rep1(opt(&) ~> primary) and update the primary
prim = literal | ... | identifier ~ pared(or) userMap(id, args => [")(", id, args])
Yes, a mangled ")(" instead of the normal order "()" could denote a call. The advantage of this approach
is that the rule reference is coupled with its arguments right by the parser, so that the translator will have
no hard time trying to match an identifier with its arguments.
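A minimal sketch of that reworked primary, reusing bothOrFirst, identifierParser, pared and lb from the parser file (untested, only to make the idea concrete):
callOrRef = bothOrFirst(identifierParser, pared(lb("or")),
  (id, [par, args]) => [")(", id, args])
Here "a" parses to a plain "a" while "a(c)" parses to [")(", "a", "c"], with the reference and its arguments coupled by the parser itself.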