Skip to content

Instantly share code, notes, and snippets.

@valtih1978
Last active September 2, 2018 13:06
Show Gist options
  • Save valtih1978/5aafe15e6fcc7c4a8933ae78116e4194 to your computer and use it in GitHub Desktop.
Save valtih1978/5aafe15e6fcc7c4a8933ae78116e4194 to your computer and use it in GitHub Desktop.
javascript-based parser
<script id="library" title="it must be a part of starndard javascript">
const log = console.log ; const assert = console.assert
update = (o, mutator) => {mutator(o); return o}
trace = (value, format) => {format = format || (data => data) ; log(format(value)); return value};
tracejs = (msg, data) => trace(data, data => [msg, data.json()])
Object.prototype.trace = function(format) {trace(this, format)}
function range (len, begin) {begin = begin || 0
return Array.apply(null, Array(len)).map((_, i) => {return begin+i;})}
Object.prototype.entries = function() {return Object.keys(this).map(k => [k, this[k]])};
//Array.prototype.fill = function(filler) {return this.map(_ => filler)} // startard func
Array.prototype.equals = function(that) {return this.every((my, i) => my == that[i] )} ; [1,2].equals([1,2])
Array.prototype.sum = function() {return this.reduce((acc, el) => acc + el,0)}
Array.prototype.product = function() {return this.reduce((acc, el) => acc * el,1)}
Object.prototype.json = function() {return JSON.stringify(this)}
Array.prototype.last = function() {return this[this.length-1]}
Array.prototype.mapField = function(key, f){ return this.map( o=>Object.assign({},o,{[key]:f(o.key)}) ) }
Array.prototype.sliceIndex = function(i) {return this.map(e => e[i])}
Array.prototype.mapIndex = function(i, f){ return this.map( child =>Object.assign([], child, {[i]:f(child[i])} ) )}
Array.prototype.equals = function(that) {return this.length == that.length && this.every((v,i)=> v === that[i])}
Array.prototype.zip = function(...those) {return this.map((my, i) => those.reduce((acc, that) => [...acc, that[i]],[my])) } //; [11,22,33].zip([1,2,3]).json()
String.prototype.toArray = function() {return range(this.length).map(i => this.charAt(i))}
Array.prototype.unzip = function() {return this[0].map((_, i) => this.map(sub => sub[i]))}
Map.prototype.getOrElse = function(key, value) { return this.has(key) ? this.get(key) : value}
Array.prototype.partition = function(filter) {return this.reduce(([passed, failed], el) => {(filter(el) ? passed : failed).push(el); return [passed, failed]}, [[],[]])}
Array.prototype.collapseAsEnv = function() { return this.reduce((res,scope) => Object.assign(res, scope), {})};
Map.prototype.getOrElse = function(key, value) { return this.has(key) ? this.get(key) : value}
Map.prototype.withDefaultValue = function(defaultValue) {
const result = new Map([...this.entries()]);
const getWas = result.get; result.get = (key) =>
result.has(key) ? getWas.call(this, key) : defaultValue;
return result
}
merge = (f, ...trees) => {
const [head, ...tail] = trees
return (typeof head == "object") ?
head.zip(...tail).map(trees => merge(f, ...trees)) : f(...trees)
} ; merge((a,b) => a+b , [1,2,3], [11,22,33])
RegExp.prototype.execAllGen = function*(input) {
for (let match; (match = this.exec(input)) !== null;)
yield [match, this.lastIndex];
} ; RegExp.prototype.execAll = function(input) {
return [...this.execAllGen(input)]};
//const matches = (/[a-z]+/g).execAll(" abc.xyz\n lkm") ; log("captured strings:", matches.map(m=>m[1]))
Array.prototype.flatten = function() {return this.reduce((acc, e) => [...acc, ...e],[])}
Array.prototype.flatMap = function(f) {return this.map(e => f(e)).flatten()};
//my sucks http://cs.stackexchange.com/questions/65962/anatomy-of-representing-of-cartesian-product-in-sw
// http://eddmann.com/posts/cartesian-product-in-javascript/
const cartesian = (...sets) => sets.reduce((acc, set) =>
acc.flatMap(x => set.map(y => [ ...x, y ])), [[]]);
const percent = p => Math.round(p * 100)
Array.prototype.isAscending = function(strict) {const greater = (a,b) => strict ? a > b : a >= b;
return this.length == 0 ? true :
this.reduce(([result, max], a) => [result && greater(a, max), a], [true, this[0]-1])[0]
} ;
//[false, true].forEach(strict => log([1,2,3,3].isAscending(strict), [1,2,3].isAscending(strict), [1,3, 2].isAscending(strict)))
var Some = function(value) { this.value = value;};
Some.prototype.get = function() {return this.value}
Some.prototype.map = function(f) {return new Some(f(this.value))}
Some.prototype.isEmpty = function() {return false}
var None = function() { };
None.prototype.get = function() {throw "attempt to get value from None"}
None.prototype.map = function(f) {return this}
None.prototype.isEmpty = function() {return true}
Try = function() {return {// children will create their class instance
makeSuccess: function(value) {return new Success(value)},
makeFailure: function(error) {return new Failure(error)},
}}
success = (value) => new Success(value);
var Success = function(value) {
this.success = value;};
Success.prototype = Object.assign(new Try(), {
isFailure: () => false, isSuccess: () => true,
map: function(f) {return this.makeSuccess(f(this.success));},
flatMap: function(f) {return f(this.success);},
getOrElse: function(value) {return this.success;},
get: function() {return this.success;},
recover: function(f) {return this},
})
fail = (error) => new Failure(error);
var Failure = function(error) { this.error = error}
Failure.prototype = Object.assign(new Try(), {
isSuccess: () => false, isFailure: () => true,
map: function(f) {return this},
flatMap: function(f) {return this},
getOrElse: function(value) {return value},
getError: function() {return this.error},
get: function() {throw this.error},
recover: function(f) {return f(this.error)},
makeSuccess: function(value) {
log("makeSuccess2", value)
return success(value)}, // children will create their class instance
})
</script>
!!!! I see that Eric Mayer discusses all three features that I wanted to address with my implementation
in one chapter "White-space, comments, and keywords" of http://www.cs.nott.ac.uk/~pszgmh/monparsing.pdf
I came up with the idea that oneOf | operator represtnts a plus + because BNF can be thougth of as a
Sum of Products. But, Eric points out that PEG is a limited version of CFG parser, which admits
ambiguities and, when alternatives bring the results, you add results together (all different ways
to parse the same string) thus, giving another reason to name it a plus operator.
!!!!!
// The basic idea of parser combinators is that A B | C D is identical to
A andThen B alternatively C andThen D.
And-Then is a default combinator. It binds stronger than alternatively, just like & is stronger than
| in logic A & B | C & D and * is stronger than arithmetical +. Now, you see that the andThen and alter
natively combinators _appear between_ the parsers. Here A B are references to other rules and there
is a parser per every rule in parser combinator. That rule is parsed with that parser. There are also
higher order parsers may appear, like opt(A), ep(A) or oneOf(parser1, parser2, parser3) -- the latter
is viable alternative to .alternatively combinator. They are higher order functions (no pun intended).
rather than parser combinators because parsers are functions that convert input stream into ParseResult,
which is a monad that encapsulates the parse result (a piece of AST produced by parser from the stream)
and the rest of the stream after parser has consumed a initial part of it. This allows to declare the
andThen and alternatively methods in the monad. AndThen will be invoked if parse result was successful
and alternatively will be invoked if it was not. Yes, ParseReult modand is and extension of Try monad
with ParseSuccess and ParseFailure subclasses.
ParseFailure's andThen method does nothing -- in returns the error it contains further, whithout
attempt to parse anything. It's alternatively method is alive though, it takes the input from the place
of failure and applies it to the argument parser, C in our example. This allows a recovery and successful
parse with alternative sequence of rules. ParseSuccess is opposite: its andThen takes the partially
consumed stream it contains, applies the parser and, in the case of success, when second parser produces
a result not failing, it creates a pair [this.result, next.result]. Reult can be updated in the ParseSuccess
monad with the standard map/flatMap methods. Because however the ParseSuccess extends the Success using
[astValue, restOfTheStream] as its success value and we want to update the astValue produced by parser
leaving the rest of the input stream intact, .userMap(astValue => ...user value update code...) method
is provided for convince.
All about text says that `A andThen B alternatively C andThen D` is technically incorrect. In fact,
the parser is
input => A(input) andThen B alternatively C andThen D
It is a funciton that produces A-monad once given the input stream, which where combinators
trigger and bind the rest of computations. It is however quite annoying to have a pattern
input => p1(input)...
all over the code. I should probably make combinators the methods of the Parser function
and forget about ParseResult, or duplicate the monadic methods in both ParseResult and parser.
// pass and fail: whenever we return a success, we eat something. Higer
order combinators do not eat the stream characters directly. They do not create the primary results --
they combine the lower order results.
// monadic parse results. Done:
// 1. realized that the top rule must be additive rather than multiplicative,
// 3. Whitepace stripping between the reules. Whenever stream is created (and, it is created on chars taken advance rather than mutated), we strip the leading spaces.
// 4. optional &-sign (as it is in math and BNF, yes sequence of nodes on the rgith hand is corresponds to & operation whereas alternative | is or-operation) skipping and rule references with arguments
// 5. parenthized sub-alternatives. This corresponds to the fact that we need parentheis
// if we want addition within a group to bind stronger than surrounding AND-s
// TODO:
// 1.Done literal calls to parametrized if parenthized expression is prefixed with rule identifier (without & symbol).
// 2.Done Simplify semantic actions. Currently, we have to map over [result, input] but parser users need to map only the result projection in most cases.
// 3. Parser generation (with keywords)
// 4. Random trees
// It is (going to be) invoked to build both parser generators and ranomd ranomd tree generators
//grammar = parse(grammar);
//grammar.recognizer().parse(input)
//grammar.generator().random()
Yes, it makes sense to do in javascript because tree generation is fast and we can send the programs for evalution
to Erlang, which is parallel and provides good isolation for proesses.
// In fact, "a b c d" stands for "a & b & c & d", which stands for "a & (b & (c & (d)))" because it is
// the same as a.andThen(b.andThen(c.andThen(d))). But, for the associative operations, we prefer the first form.
// With botOrFirst operator, I can write and parse the associative expressions like
// botOrFirst(multiplicative, "|".overrideWith(multiplicative) ) which gives me the list of addition operands
// in the plain form (+ a b c d ...) instead of ugly nest a+(b+(c+(d+...))). Moreover, this eliminates the need
// for late bindings necessary in "or = and + or | and" -- not or is defined throug or.
// if skip is on, input stream will use our parser and discard the results
// We won't use oneOf(whitespaces.asArray9().map(literal)) to allow parser use
// without dependencies and for performance reasons.
/*
a big coding disadvantage of my monadic approach is that I must type the input => parser(input)
input => firstParser(input).andThen(combinators).overrride(thatIwant)
everywhere. I was also lazy to implement andThen in the ParseResult monad, which is outght to be
extendsion of Try monad and placed then into Object.prototype. Alternative could try to attach
the monadic methods to the parsers. Parser.andTHen must initialize parser's 'next' field such that
when it gets a result, it makes the next action, passing its result as an argument.
*/
// DONE: stringification and even some parsification after parsnthesis are made mandatory to create
// label scopes. TODO: replace parenthesis with mandatory + nodes.
// TODO: stream(string) into string.toStream() since parser("abc".toStream()) is easier to read than parser(stream("abc"))
// Moreover, do "abc".parse(parser) instead of parser(stream("abc"))
// TODO: eliminate parenthesis nodes -- replace them with mandatory "+". The parenthesis are mandator now
// anyway for label scoping. Translation will start with the {} in the stack, which corresponds to the
// default +/& scope and every + will create it subscope stack frame as () did previously. `
// Let's use () nodes for referenced rule arguments since we do not need two kinds of parenthesis anymore!
tracejs = (msg, data) => trace(data, data => [msg, data.json()])
logRem = (msg, input) => log(msg + " from", input.rest())
plog = (string, parser) => {let n = parser.name; if (n == "") n = "anonym parser"
tracejs(n /*+ "(" + string + ") = "*/, parser(stream(string)))}
// I could extend the Object.prototype with ParseResult methods like .andThen but
// let's pay coding prise and the performance penalty for extending the ParseResults only
function ParseError(error) {
Object.assign(this, Failure.prototype); Failure.call(this, error)
this.makeSuccess = function(value) {return new ParseSuccess(value)} //teach Success to make ParseResults
};
function ParseSuccess(result) {
Object.assign(this, Success.prototype); Success.call(this, result);
this.makeSuccess = function(value) {return new ParseSuccess(value)} // set our constructor for Success
}
const ParseResult = ParseError.prototype = ParseSuccess.prototype = {};// monadic methds to be added
//success = result => new ParseSuccess(result) ; fail = (error) => new ParseError(error)
consumeWhile = (condition) => input => {let offset = 0; for (; offset < input.len() &&
condition(input.charAt(offset)) ; offset++) ; return input.pass(input.rest(offset), offset)};
consumeFollowingChars = (chars) => consumeWhile(char => chars.indexOf(char) != -1)
skip20TabCrNl = consumeFollowingChars(" \t\r\n")
stream = (string, options) => {
const result = Object.assign({source: string, offset:0, skip:skip20TabCrNl,
peek: function() {return string.charAt(this.offset)},
charAt: function(at) {return string[this.offset + at]},
fail: function(msg) {return new ParseError({at: this.offset, msg: msg})},
len: function() {return string.length - this.offset},
rest: function(len) {return len == undefined
? string.substring(this.offset)
: string.substring(this.offset, len + this.offset)},
clone: function(offset, options) {
return stream(string, Object.assign({
offset:this.offset+offset, skip:this.skip}, options))},
eat: function(nChars) {return nChars == 0 ? this : this.clone(nChars)},
pass: function(result, consumed) {return new ParseSuccess([result, this.eat(consumed)])}
}, options);
return result.skip(result).get()[1]
}
checkParseResult = (inversion, input, parser, expected) => {
let [name, parseResult] = parser.isSuccess != undefined ?
[input, parser] : [parser.name, parser((typeof input == "string") ? stream(input) : input)]
parseResult = parseResult.flatMap(([presult,rem]) =>
expected != undefined && expected.ast != undefined && presult.json() != expected.ast.json()
? fail("parsed result "+presult.json() +" whereas "+expected.ast.json() + " expected")
: success([presult, rem])).flatMap(([pres,rem]) =>
expected != undefined && expected.rem != undefined && expected.rem != rem.len()
? fail("There are " + rem.len() + " chars remains in the stream wheras we wanted " + expected.rem)
: success([pres, rem]) )
//const spec = parser //[parser.name, "(\"" + input + "\")"].json();
const result = inversion(name, parseResult)
if (result.isFailure()) console.error(result.getError()) ; return result
} ;
mustFail = (...checkArgs) => { return checkParseResult((parser, checked) => checked.isFailure()
? success([parser, "detected user input error", checked.getError().json(), "as expected"])
: fail(parser + " had to fail, yet, produced Success(" + checked.get().json() + ")")
, ...checkArgs);
}
mustPass = (...checkArgs) => { return checkParseResult((parser, checked) => checked.isSuccess()
? success({parser:parser, parseResult:checked.get().json()})
: fail({parser:parser, err:checked.getError().json()}), ...checkArgs);
}
mustPass(" \ta", input => input.pass(input.peek(), 1), {ast: "a"}) // ensure that sw stripped and a consumed
mustFail(" \ta", input => input.fail("whatever")) // input and parser are send to checker
mustPass("", stream("abc").pass(1, 0), {ast:1}) // parser name and parse result are send to checker
// show that we consume whitespaces from as stream that does not do that for us
keepWhitespaces = (string) => stream(string, {skip:consumeFollowingChars("")})
successParser = (result) => input => input.pass(result, 0)
mustPass(keepWhitespaces(" \ta"), successParser(), {rem:3}) // check that nothing was consumed
mustPass(keepWhitespaces(" \ta"), consumeFollowingChars(" \t\n\r"), {rem:1}) // a is left in stream
if (false) {const abc = stream("abc"); const bc = [abc, abc.eat(1)]
log(bc.map(s => [s.rest(), s.peek()]))}
named = (name, f) => update(f, f => {Object
.defineProperty(f, "name", {writable: true}); f.name = name})
literal = literal => named("literal(" + literal + ")", (input) => {
const match = input.rest(literal.length);
return match == literal ? input.pass(match, literal.length)
: input.fail("expected "+literal + " whereas got " + match)
});
mustPass("abc", literal("ab")) ; mustFail("aab", literal("ab"))
weUseRe = re => named(re, input => {const match = input.rest().match(re);
return match != null && match.index == 0
? input.pass(match[0], match[0].length)
: input.fail("expected pattern " + re + " whereas got " + input.rest(20)) });
mustPass("abc", weUseRe(/ab/), {rem:1})
ParseResult.userMap = function(f) {return this.map(([res, input]) => [f(res), input])}
ParseResult.userFlatMap = function(f) {return this.flatMap(([res, input]) => f(res))}
oneOf = (...parsers) => {
const specializedName = "oneOf " + parsers.map(p => p.name)
return named(specializedName, input => parsers.reduce((failed, parser) =>
failed.recover(_ => parser(input)), input.fail("oneOf seed with alternatives " + specializedName))
.recover(_ => input.fail(specializedName + " expected whereas got " + input.rest(20))))
}
ParseResult.andThen = function(...parsers) { return this.flatMap(([myResult, input]) =>
oneOf(...parsers)(input).userMap(hisResult => [myResult, hisResult]))}
ParseResult.andThen = function(another) { return this.flatMap(([myResult, input]) =>
another(input).userMap(hisResult => [myResult, hisResult]))}
mustPass("a & ab1", input => literal("a")(input).andThen(literal("&")).andThen(oneOf(literal("c"), literal("a"))))
ParseResult.orElse = function(thatResult) {return this.recover(_ => thatResult)}
mustPass("a&ab", i => literal("b")(i).orElse(successParser("alt")(i)), {ast:"alt"})
ParseResult.overrideWith = function(parser) {return this.flatMap(([_, input]) => parser(input))}
overrideP = (skipedParser, replacement) => input => skipedParser(input).overrideWith(replacement)
mustPass("&c", input => literal("&")(input).overrideWith(literal("c")), {ast:"c"})
mustPass("&c", overrideP(literal("&"), literal("c")), {ast:"c"})
ParseResult.skipNext = function(parser) {return this.flatMap(([my, input]) => parser(input).userMap(_ => my))}
enclosed = (...args) => { // args either (left, content, right) or ("lr", content) where lr is something like []
const [lp, content, rp] = args.length == 2 ? [args[0][0], args[1], args[0][1]] : args
return named(`${lp}${content.name}${rp}`, input => literal(lp)(input).overrideWith(content)
.skipNext(literal(rp)).userMap(content => /*({class: lp+rp, value:content})*/[lp+rp, content]))
}
pared = content => enclosed("()", content) ; mustPass("(abc)", pared(literal("abc")))
// undefined spec captures any literal, specified caputres specific one
const QQ = "\"\"";
stringLiteral = (spec) => enclosed(QQ, spec == undefined ? weUseRe(/[^"]*/) : literal(spec))
mustPass('"abc"', stringLiteral("abc"), {ast:[QQ, "abc"]}); mustPass('"abc"', stringLiteral());
identifierParser = weUseRe(/[a-zA-Z][a-zA-Z0-9]*/)
theyUseRe = enclosed("//", weUseRe("[^/]+")) // plog("/ab[c]+/", theyUseRe)
ujsBrackets = new Map([["}", -1], ["{", +1]]).withDefaultValue(0)
userJsAction = enclosed("{}", input => (() => {
const loop = (eat, deficit) => deficit == 0 ? input.pass(input.rest(eat-1), eat-1) :
eat == input.len() ? input.fail("could not find java end") :
loop(eat + 1, deficit + ujsBrackets.get(input.charAt(eat)))
return loop(0, 1)})())
lb = name => named("lb_" + name, (...args) => eval(name)(...args)) // late binding
bothOrFirst = (a, b, map) => named(a.name + " or " + a.name + " ~ " + b.name,
input => {const A = a(input); return A.andThen(b).userMap(ab => map(...ab)).orElse(A)})
//const args = bothOrFirst(lb("or"), input => literal(",")(input).overrideWith(args),
//(a,b) => [",", a].concat(b[0] == "," ? b.slice(1) : b))
const args = bothOrFirst(input => or(input).userMap(arg => [arg]),
input => literal(",")(input).overrideWith(args), (a,b) => [...a,...b])
primary = named("primary",
oneOf(stringLiteral(),
bothOrFirst(identifierParser, pared(args), (rule,[par,args]) => ["()", rule, ...args]),
theyUseRe, userJsAction
, input => pared(or)(input).userMap(([par, terms])=> terms) ))
mustPass("abc", primary)
ParseResult.binMap = function(f) {return this.userMap(([a,b]) => f(a,b))}
bin = (name, binMap, a, ...b) => named(name, input => { const seed = a(input)
return seed.andThen(...b).binMap(binMap).orElse(seed)})
// Actually, I could define bin "withOption" combinator, like
ParseResult.withOption = function(...parsers) {
return this.andThen(...parsers).orElse(this)}
unfortunateBin = (op, a, ...b) => input => a(input).withOption(...b)
.binMap(binSignMap(op)) // here is the problem
// But, I would not be able to map the seed-option paris as I did in the bin in this case
opt = (bodyP) => named("optional " + bodyP.name, input => (bodyP)(input)
.userMap(result => new Some(result)).orElse(successParser(new None())(input)))
mustPass("11", opt(literal("1")), {rem: 1}); mustPass("11", opt(literal("2")), {rem: 2})
labelledPrimary = named("labeledPrimary",
input => opt(input => identifierParser(input).skipNext(literal(":")))(input).andThen(primary)
.userMap(([label, primary]) => label.isEmpty() ? primary : [":", label.get(), primary]))
mustPass("aa", labelledPrimary, {ast:"aa"});
mustPass("a2:aa", labelledPrimary, {ast:[":","a2","aa"]});
//mustPass("a:(aa)", labelledPrimary, {ast:[":","a",["()", "aa"]]}); // or is not defined for that
binSignMap = (sign) => (a,b) => "(" + a + sign + b + ")"
// after &, parenthesis are removed in AST to make difference with function calls
// This has nice side effect that a & (b c) is converted into &(a,b,c) but
// downside of this is that al:a & (bl:b) is converd into al:a & bl:b and we have label scope mixing error
parAfterAmpersand = input => overrideP(literal("&"), labelledPrimary)(input)
.userMap(res => res[0] == "()"? res[1]: res)
// Our second version of and/or rules. Here, we flatten the nest of & binary subexpressions
// because we do not have the rep
and = bin("and",
// binSignMap("*")
//(a,b) => "(" + a + "&" + b + ")"
//(a,b) => a + "&" + b
//(a,b) => ({class:"&", a:a, b:b})
//(a,b) => ["&", a, b]
(...args) => args.reduce((acc, a) => a[0] == "&" ? [...acc, ...a.slice(1)] : [...acc, a], ["&"])
//(...args) => and(...args) below is a late binding!
, primary,
lb("and"), parAfterAmpersand)
or = bin("or",
//binSignMap("+")
(...args) => args.reduce((acc, a) => a[0] == "+" ? [...acc, ...a.slice(1)] : [...acc, a], ["+"])
, and
, overrideP(literal("|"), lb("or")))
if (false) plog("x y z (k l | m t | p | f) a b c)", and)
// after &, parenthesis are removed in AST to make difference with function calls
if (false) plog("(a| m) a c & (a | l | t) | a c & d (a r g) (r | m) ", or)
flatTail = ([a,b]) => [a, ...b]
//kneele star, rep(a) = a ~ rep(a) | ε
rep = (bodyParser) => named("rep of " + bodyParser.name, input => oneOf(
input => (bodyParser)(input).andThen(rep(bodyParser))
.userMap(flatTail), successParser([]))(input)) // there will be a failure in the end and we cure it with empty array
//kneele plus, rep1(a) = a ~ rep(a)
rep1 = (bodyParser) => named("rep1 of " + bodyParser.name,
input => bodyParser(input).andThen(rep(bodyParser)).userMap(flatTail))
const [la,comma] = ["a", ","].map(literal)
;["aaa", ""].map(input => mustPass(input, rep(la)));
mustPass("aa", rep1(la)); mustFail("baa", rep1(la));
// rep1sep(a, sep) = a ~ rep(sep ~> a)
// repsep(a, sep) = rep1(sep ~> a) | ε
rep1sep = (item, sep) => named("rep_" + item.name + "_sep_" + sep.name,
bothOrFirst(item, rep(input => sep(input).overrideWith(item)), (a,b)=> [a, ...b]))
repsep = (item, sep) => named("rep1_" + item.name + "_sep_" + sep.name,
oneOf(rep1sep(item, sep), successParser([])))
if(true) ["a,a,a", ""].map(input => mustPass(input, repsep(la, comma)) );
mustPass("a,a,a", rep1sep(la, comma)); mustFail("", rep1sep(la, comma));
// 3rd version of and/or parsers which flattens the binary nest automatically, by virtue of rep parser
//and = botOrFirst(labelledPrimary, rep1(labelledPrimary, parAfterAmpersand)), (a,b) => ["&", a, ...b])
//and = input => rep1sep(labelledPrimary, opt(literal("&")))(input).userMap(items => ["&", ...items]) // "b" => [&,b]
and = named("and", input => {const anded = bothOrFirst(
labelledPrimary, rep1(input => opt(literal("&"))(input).overrideWith(labelledPrimary)),
(pr,rep) => ["&", pr, ...rep])(input)
return anded.userMap(literals => {//const strip = literals[0] == ":";
// if(strip) console.warn("label", literals[1], "makes no sense -- it will be stripped from ast");
// return strip ? literals[2] : literals})})
return literals[0] == ":" ? ['&', literals] : literals})})
or = named("or", bothOrFirst(and, rep1(overrideP(literal("|"), and)), (seed,tail) => ["+", seed, ...tail]))
//plog("a (c) b1:b & (c d) (kk:k|l)", or)
// check the parse pass and fail themselves
if (false) { mustPass("aa", primary, {ast:"aa", rem:0}).json()
log(1, [mustFail("&", primary), mustFail("a", primary),
mustFail("a", primary, ["ab",2])].map(r => r.json() + "\n")); mustPass("a", primary).json()}
testReport = (testResults) => {const failed = testResults.filter(tr => tr.isFailure());
const successCount = testResults.length - failed.length
if (successCount == testResults.length) log("all " + testResults.length + " tests succeeded");
else log("only", successCount, "tests succeded whereas", failed.length, "failed");
} ; testReport([mustFail("&", primary), mustPass("a", primary), mustPass("a", primary, {ast:"a", rem:0}),
mustPass("x (r)", or, {ast:["()","x","r"]}), mustPass("x (y (k,l))", or, {ast:["()","x",["()", "y", "k", "l"]]}),
mustPass("(/a/| m) c | a1 c & dd:d (mm:m | l n) \"a\" bb:(b b)", or, {ast:["+",["&",["+",["//", "a"],"m"],"c"],
["&","a1","c",[":","dd",
["()","d",["+",["&", [":", "mm", "m"]],["&","l","n"]]]],
[QQ, "a"],[":","bb",["&","b","b"]]]], rem:0})])
eos = input => input.len() == 0 ? input.pass("eos", 0) : input.fail("eos expected but " + input.rest(10) + " found")
ParseResult.finished = function() {return this.skipNext(eos)}
mustPass("", eos) ; mustFail("a", eos) ; mustPass("a", input => (literal("a")(input)).finished())
// The high-level architecture
// We parse the grammar. It consists of the rules like
// name (optional args) = rhs, separated by \n. Our parsifier creates a parer from the rhs AST.
// This parser may refer other rules/parsers. When this parser is asked to parse user iniput,
// it will pass the other parser - arguments and call the referenced parser.
grammar = `
// after math a b c x + x y z = +(·(a,·(b,·(c,d)), ·(x,·(y,z)))
// we can type y = a b c | k l m, abbreviating the fully-qualified syntax y = a & b & c | k & l & m
// but we cannot ommit the & in case of parethesis, y = a & (b | c) since a(b | c) will call rule a with arguments
start
= additive
additive
= left:multiplicative "+" right:additive { return left + right; }
| multiplicative
multiplicative
= left:primary "*" right:multiplicative { return left * right; }
| primary
primary
= integer
| "(" additive:additive ")" { return additive; }
integer
= digits:/[0-9]+/ { return parseInt(digits.join(""), 10); }`
comments = input => skip20TabCrNl(input).overrideWith(input =>
(input.len() < 2 || input.charAt(0) != '/' || input.charAt(1) != '/') ?
input.pass("", 0) : consumeWhile(c => c != '\n')(input).andThen(comments)
)
mustPass("// this is a comment\n aaa", comments, {rem:3, ast:["// this is a comment", ""]})
mustPass(stream(" ///behind two \n // lines 2 \n x", {skip:comments}), literal("x"))
mustPass(stream(" ///behind two \n // lines 2 \n ", {skip:comments}), eos)
stripLeadingComments = (string) => comments(stream(string/*, {skip:comments}*/))
mustPass("stripping comments", stripLeadingComments( // we an get comments if they are not skipped
" //1st \n //2nd line "), {rem:0, ast:["//1st ",["//2nd line ", ""]]})
ruleParser = input => identifierParser(input).andThen(opt(pared(rep1sep(identifierParser, literal(","))))).skipNext(literal("=")).andThen(or).finished()
mustPass(stream("a=b // c", {skip:comments}), ruleParser)
// todo: we do not need scopeChain in the stringify. It is for parsify where we will execute javascript
const stringify = (rhs) => {
const stringifyAComposite = () => { const [tag, ...args] = rhs;
const join = separator => args.map(stringify).join(separator)
return ["{}", QQ, "//"].indexOf(tag) != -1 ? tag[0] + args[0] + tag[1] : {
"()": () => args[0] + "(" + stringify(args[1]) + ")",
":": () => args[0] + ":" + stringify(args[1]),
"+": () => join("|"), "&": () => join("&") ,
}[tag]()} ; return typeof rhs == "string" ? rhs : stringifyAComposite()
}
a = "la: aa lb:(lc:{js1}) f(xx:x|y) {js2}"; plog(a, or); log(stringify(or(stream(a)).get()[0], [{}]))
// ToDo: in addition to stringify, make rnd and parsify functions. We could use random
// trees stringified to test our generated parsers. We will need randexp!
backToTextCheck = (right) => { const unparsedRule = "integer = " + right
const parsed = stripLeadingComments(unparsedRule).overrideWith(ruleParser)
const noWS = str => str .replace(/\s*/mg, "")
mustPass(unparsedRule + " stringification test", parsed.userMap(
([left, right]) => left + " = " + stringify(right)).userMap(noWS)
.flatMap(([stringified, input]) => stringified == noWS(unparsedRule) ?
input.pass(stringified,0): input.fail(stringified + " != "+ noWS(unparsedRule))) )
}
;['b:a', 'b:/a/ & "c" ', 'k & l', 'k|l', 'f(x) | l1:g (l2:/y/ & "z") & {js1}'].forEach(backToTextCheck)
const parseRhs = (grammar) => stripLeadingComments(grammar).overrideWith(or).finished()
const rhsCheck = (kind, grammar, input) => kind("applying grammar " + grammar + " to " + input,
parseRhs(grammar).userFlatMap((ast) => parsify(ast, new Set())(stream(input))))
const rhsPass = (...args) => rhsCheck(mustPass, ...args)
const rhsFail = (...args) => rhsCheck(mustFail, ...args)
//rhsFail(`"a" "a" "c"`, "aab")
//defined = [];
pInfo = {}, l2r = [{}], used = new Set(); env = [{}]
// Stringify, parsify or both https://softwareengineering.stackexchange.com/questions/336243
parsify = (rhs) => {
if (typeof rhs == "string") rhs = ["()", rhs];
const [tag, ...args] = rhs;
const join = separator => args.map(parsify).map(p => p.name).join(separator)
return {
[QQ]: () => named('"' +args[0] + '"', literal(args[0])),
"//": () => named('/' +args[0] + '/', weUseRe(args[0])),
"{}": () => named('{' +args[0] + '}', input => {const myEnv = env.collapseAsEnv()
return input.pass(eval("with (myEnv)" + args[0]), 0)}),
":": () => {const parser = parsify(args[1]); return named (args[0]+":"+parser.name,
input => parser(input).userMap(res => env.last()[args[0]] = res))},
"()": () => {const f = args.shift(); used.add(f);
const pa = args.map(parsify) ;
return named(f + "("+pa.map(a => a.name).join(",")+")",
//defined[f](...pa) // immediate is easy and fast but won't handle recursions
//input => defined[f](...pa)(input) // so, resolve refs at parser use time.
input => {; const spec = f + "("+args.join(",") + ")"
const ref = l2r.collapseAsEnv()[f]
log("defined", spec, l2r.collapseAsEnv())
if (ref == undefined) throw spec +" is undefined"
const namedArgs = pInfo[f].reduce((res, chName, i) => // pos to named
Object.assign(res, {[chName]: pa[i]}), {})
log("named args", namedArgs)
return l2r.withPush(namedArgs, () => ref(input))
return ref(input)}
)},
"+": () => named(join("|"), oneOf(...args.map(parsify))),
"&": () => {const children = args.map(parsify)
// Here is the issue
// https://softwareengineering.stackexchange.com/questions/344513/semantic-actioin-in-parser-combinators
return input => env.withPush({}, () => {
return children.reduce((acc, p, i) => args[i][0] == "{}"
? acc.overrideWith(p).userMap(r => /*[r]*/r)
: acc.flatMap(([acc,input]) => p(input).userMap(r => [...acc, r])), input.pass([], 0))
})
}
}[tag]()
}
p1 = parsify(or(stream('a:"a" b:"b"& c:(k: "k" "l")')).get()[0], [{}]) ; mustPass("a b k", p1)
log(p1.name)
if (false) {a = '"a" & "b" & "b2" | /c+/'; plog(a, or);
ap = parsify(or(stream(a)).get()[0], [{}])
log(ap.name); ["abb2", "a bb2", "c"].forEach(input => mustPass(input, ap)) ; mustFail("abb 2", ap);}
/*pInfo["rep1sep"].parsified = rep1sep
rep1sepAComma = or(stream('rep1sep("a", ",")')).get()[0] ; log("ast:", rep1sepAComma);
rep1sepAComma = parsify(rep1sepAComma); mustPass("a,a,a", rep1sepAComma)
defined["int"] = () => int // convert simple parser into zero-arg parser
int = or(stream("/[0-9]+/")).get()[0] ; log(int);
int = parsify(int); log(int(stream("001")).get()[0]);
intRef = or(stream("int()")).get()[0] ; intRef = parsify(intRef);
log("used", used, used.size, Object.keys(defined).length)
log(intRef.name, intRef(stream("002")).get()[0]);
int = or(stream("a:/[0-9]+/ {parseInt(a)}")).get()[0] ; log(int);
int = parsify(int); log(int(stream("001")).get()[0]);
jp = parsify(or(stream('a:int b:int {console.log("Hello from grammar! Env="+JSON.stringify(env)); 1}')).get()[0], [{}]) ; log(jp(stream("12 3")).get()[0])
js = 'a:int b:int {a+b}'; plog(js, or); jp = parsify(or(stream(js)).get()[0]);
log("a+b=",jp(stream("1 2")).get()[0])
*/
//log(parsify(parseRhs(`"aa" {"jj"} "bb" "cc"`).get()[0])(stream("aa bb cc")).get()[0].json())
generate = (grammar) => {
// separate grammar by \n rule_identifier = ... separator. We will parse every rule separately
const offsets = /\n\s*[a-zA-Z][a-zA-Z0-9]*\s*(\(.*\))?\s*=/g.execAll(grammar).map(([match, at]) => match.index)
const unparsedRules = [...offsets, grammar.length].reduce(
([rules, start], end) => [[...rules, grammar.substring(start, end)], end+1]
, [[], 0])[0]
.map(stripLeadingComments) // convert into input stream, strippint leading comments
.filter(result => result.get()[1].len() != 0) // ignore empty (pure comment) rules
const parsedRules = unparsedRules.map(parseResult => parseResult.overrideWith(ruleParser))
const [succeeded, failed] = parsedRules .partition(res => res.isSuccess())
log("succeeded",succeeded.length,"rules of",parsedRules.length,":")
if (parsedRules.length != succeeded.length)
console.error(parsedRules.length - succeeded.length, "rules failed:");
else {succeeded.map(res => res.get()[0]).forEach(([[name, args], rhs]) => {
log(name,args.isEmpty() ? "" : '(' + args.get()[1] + ')', "=", rhs.json())
l2r[0][name] = parsify(rhs)
pInfo[name] = args.isEmpty() ? [] : args.get()[1]
})
} ; failed.forEach((res, i) => log(i+1, res.json()))
}
//generate('integer = digits:/[0-9]+/ { return parseInt(digits);} | "abc" ')
generate('int = a:/[0-9]+/ { parseInt(a)} \n sum = a:int b:int {a+b} \n double(a) = a a \n '
+'andThen(a, b)=a b \n x = "x" \n intRef = int \n intRef2 = intRef \n'
+ 'double2(c) = double(c)')
log(pInfo, pInfo.sum)
String.prototype.parse = function (pname, ...parsifiedChildren) {
const namedArgs = pInfo[pname].reduce((res, chName, i) => // pos to named
Object.assign(res, {[chName]: parsifiedChildren[i]}), {})
l2r.withPush(namedArgs, () => plog(this, l2r[0][pname]))
}
"13 04".parse("double2", l2r.collapseAsEnv()["int"]) // the problem
//https://softwareengineering.stackexchange.com/questions/344920/evaluation-of-calls-with-parameters
throw 1
"a a".parse("double", literal("a"))
"a b".parse("andThen", ...['a', 'b'].map(literal))
"10".parse("int"); "11".parse("intRef"); "12".parse("intRef2");
"01 02".parse("double", l2r.collapseAsEnv()["int"])
// the problem here is that literal argument is ignored and [(), a] parsers
// will refer undefined
//plog('"a" "a"', defined['double'](literal("a")))
//generate('integer = (p r) a "b" /[0-9]+/ | {a = b} ')
//generate('integer = l1:x l2:(inner: a {js1}) {js2}')
//generate('integer = f(x y) g & (k l)')
//generate('integer = f(x) | g (/y/ "z") ')
//generate('integer = l1:(ll1:a) l2:y self:(inner: a {js_inner}) {js_outer}')
//generate(grammar)
The key point of first version is in line
parAfterAmpersand = input => overrideP(literal("&"), labelledPrimary)(input)
.userMap(res => res[0] == "()"? res[1]: res)
used in and-parser
and = botOrFirst(labelledPrimary, rep1(oneOf(labelledPrimary, parAfterAmpersand)), (a,b) => ["&", a, ...b])
It says that we must override & symbols with what they concatenate, so that text "a & b & c" becomes [&,a,b,c] in
parse tree because ampersands stand for default 'andThen' combinators and text "a & b & c" is equivalent to
"a b c". You do not write "and then" usually in BNF, you assume it if there is a space between child rule
identifiers/references.
The user map gets rid of parenthesis so that "a & (b | c)" and "a & (C) are parsed into [&, a, [+, b, c]] and
[&, a, c] instead of bulky [&, a, [(), [+, b, c]]] and [[&, a, [(), c]]]. But, the reason is not just unnecessary
parenthesis elimination. Because we skip &s in our sequence, we lose the difference between "a & (c)", which
stands for "a and then c" and "a(c)" which stands for "call a with argument c". In order to preserve semantis
while simplifying the expressions, I removed unnecessary parenthesis after and. They become unnecessary in
parsed AST because there is a single child node within parenthesis and we can refer it directly from the parent
and-then chain.
This however became problematic in presence of labels. Labels have a scope within and-then sequence. That is,
inner javascript may see all a,b and c labels but outer javascript should see only a an b labels
al:a & bl:b & (cl:c {inner javascript}) {outer javascript}
If however, we remove the parenthesis
al:a & bl:b & cl:c {inner javascript} {outer javascript}
we have got all labels in one scope.
There is another problem. If lables did not create them and there is a way to distringuish argument parenthesis
from the anded parenthesis, the translator still needs to sense next behind every identifier in order
to couple then into a single call.
So, one way to solve the label problem would be to evaluate the parenthesis at parse time.
Another solution would be to leave the parenthesis after eliminated & and change the "rule_id (args)"
call parser: instead of
rep1(oneOf(primary, optionalAndWithParenthesis) )
we could leave the and = rep1(opt(&) ~> primary) and update the primary
`prim = literal | ... | identifier ~ pared(or) userMap(id, args => [")(", id, args])
Yes, mangled ")(" instead of normal order "()" could denote a call. The advantage of this approach
is that rule reference is coupled with its arguments right by the parser so that translator will have
no hard time trying to match identifier with arguments.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment