Last active
October 8, 2019 17:54
-
-
Save jorendorff/403e1ea0c3c42dad730b3be8b6dc8cf0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""generate_js_parser_tables.py - Generate tables from the ES grammar.""" | |
import argparse | |
import os | |
import jsparagus.gen | |
import jsparagus.grammar | |
from .parse_esgrammar import parse_esgrammar | |
from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS | |
# Goal nonterminals: the grammar entry points for which parser tables are
# generated.  Two more goals are listed but currently disabled.
ECMASCRIPT_GOAL_NTS = [
    'Script',
    'Module',
    # 'FormalParameters',
    # 'FunctionBody',
]
# Synthetic terminals: maps each synthetic terminal name to the set of
# concrete token names it stands for.  'IdentifierName' covers plain names
# plus all keywords (full and conditional); 'Identifier' covers plain names
# plus only the conditional (contextual) keywords.
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    'IdentifierName': {
        'Name',
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    },
    'Identifier': {
        'Name',
        *ECMASCRIPT_CONDITIONAL_KEYWORDS
    }
}
def hack_grammar(g):
    """Return a copy of grammar `g` with most boolean parameters removed.

    The current parser generator fully expands parameterized productions,
    which is a huge pain, so every parameter not listed in KEEP is thrown
    away, along with the nonterminal arguments and production conditions
    that refer to it.
    """
    KEEP = ['In', 'Default']

    def keep_params(params):
        return tuple(p for p in params if p in KEEP)

    def keep_args(args):
        return tuple(pair for pair in args if pair[0] in KEEP)

    def strip_element(e):
        """Strip filtered-out nt arguments from one right-hand-side element."""
        if isinstance(e, jsparagus.grammar.Nt):
            return jsparagus.grammar.Nt(e.name, keep_args(e.args))
        if isinstance(e, jsparagus.grammar.Optional):
            return jsparagus.grammar.Optional(strip_element(e.inner))
        return e

    def strip_condition(c):
        # Keep a condition only if it tests a surviving parameter.
        if c is not None and c[0] in KEEP:
            return c
        return None

    def strip_production(p):
        """Rebuild a production without discarded conditions/arguments."""
        new_body = [strip_element(e) for e in p.body]
        return jsparagus.grammar.Production(
            new_body, p.reducer, condition=strip_condition(p.condition))

    stripped = {
        nt: jsparagus.grammar.NtDef(
            list(keep_params(nt_def.params)),
            [strip_production(p) for p in nt_def.rhs_list],
            nt_def.type)
        for nt, nt_def in g.nonterminals.items()
    }
    return g.with_nonterminals(stripped)
def investigate2(states):
    """Debug aid: explore what lookbehind distinguishes "div" states from
    "re" states in the parser-state graph.

    "div" states are those that can shift `?` or `/=` (i.e. a slash would
    be a division operator); "re" states are those that can shift a
    RegularExpressionLiteral.  Results are printed; `states` is not
    modified.
    """
    from collections import defaultdict
    grammar = states.grammar

    # back_edges[j][symbol] = set of state ids from which shifting `symbol`
    # (terminal action) or taking a goto on `symbol` (nonterminal) leads to
    # state j.
    back_edges = defaultdict(lambda: defaultdict(set))
    for state in states.states:
        for t, action in state.action_row.items():
            if action >= 0:  # shift
                back_edges[action][t].add(state.id)
        for nt, next_state in state.ctn_row.items():
            back_edges[next_state][nt].add(state.id)

    def all_back_edges(ids):
        # Union of back-edge maps over a set of states, keyed by symbol.
        result = defaultdict(set)
        for i in ids:
            for symbol, js in back_edges[i].items():
                result[symbol] |= js
        return result

    def report(seq, what):
        print(seq, "=>", what)

    def distinguish(a, b, a0, b0, postfix):
        """See what lookbehind distinguishes two sets of states."""
        # skip_set records (context-name, verdict) pairs already reported
        # via the a0/b0 shortcut below, so the per-symbol loops can avoid
        # re-reporting the same information.
        skip_set = set()
        if postfix:
            for name, target_set in [('div', a0), ('re', b0)]:
                # If one side's predecessors lie entirely within a known
                # context, report that context directly.
                if a & target_set and not (b & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "div")
                    skip_set.add((name, "div"))
                if b & target_set and not (a & target_set):
                    yield("<{} context> {}".format(name, grammar.rhs_to_str(postfix)), "re")
                    skip_set.add((name, "re"))
        za = all_back_edges(a)
        zb = all_back_edges(b)
        # Symbols reachable only on one side.
        zau = set(za.keys()) - set(zb.keys())
        zbu = set(zb.keys()) - set(za.keys())
        # Collapse whichever side has more unique symbols into one
        # "<other>" line to keep the output manageable.
        # NOTE(review): the branches are asymmetric — the first compares
        # len(zau) > len(zbu) but the second tests len(zbu) > 1; possibly
        # intentional for this exploration, possibly a typo — confirm.
        if len(zau) > len(zbu):
            others = 'div'
            for symbol in zau:
                del za[symbol]
        elif len(zbu) > 1:
            others = 're'
            for symbol in zbu:
                del zb[symbol]
        else:
            others = None
        # Symbols unique to the div side: report as div lookbehind,
        # unless already covered by a context line recorded in skip_set.
        for symbol in sorted(za, key=repr):
            if symbol not in zb:
                pre = za[symbol]
                if pre & a0 and not (pre & b0) and ('div', 'div') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 'div') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "div"
        # Symbols unique to the re side are reported; symbols common to
        # both sides are recursed into with one more symbol of lookbehind.
        for symbol in sorted(zb, key=repr):
            if symbol not in za:
                pre = zb[symbol]
                if pre & a0 and not (pre & b0) and ('div', 're') in skip_set:
                    pass
                elif pre & b0 and not (pre & a0) and ('re', 're') in skip_set:
                    pass
                else:
                    yield grammar.rhs_to_str([symbol] + postfix), "re"
            else:
                yield from distinguish(za[symbol], zb[symbol], a0, b0, [symbol] + postfix)
        if others is not None:
            yield "<other> " + grammar.rhs_to_str(postfix), others

    div_ss = {s.id for s in states.states if '?' in s.action_row or '/=' in s.action_row}
    re_ss = {s.id for s in states.states if 'RegularExpressionLiteral' in s.action_row}
    # A handful of states can accept both; tolerate at most 4 of them.
    assert len(div_ss & re_ss) <= 4
    div_ss -= re_ss
    for i, o in distinguish(div_ss, re_ss, div_ss, re_ss, []):
        report(i, o)
def investigate(states):
    """Debug aid: compute start/finish sets for the grammar and print which
    symbols can appear immediately before a RegularExpressionLiteral vs.
    before `/`, and the overlap between the two.

    Purely diagnostic; prints results and does not modify `states`.
    """
    from jsparagus import grammar
    import collections
    import walk_graph

    # Figure out which nonterminals can match the empty string.
    # empty_nts maps nt -> True once proven nullable; implications holds
    # (rhs, nt) pairs meaning "if every element of rhs is nullable, nt is".
    empty_nts = {}
    implications = []
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            rhs = [e for e in prod.body if grammar.is_concrete_element(e)]
            if len(rhs) == 0:
                empty_nts[nt] = True
            elif all(isinstance(e, grammar.Nt) for e in rhs):
                implications.append((rhs, nt))
    # Iterate to a fixed point: keep applying implications until nothing
    # new becomes nullable.
    done = False
    while not done:
        done = True
        for rhs, nt in implications:
            if nt not in empty_nts and all(empty_nts.get(e) for e in rhs):
                done = False
                empty_nts[nt] = True

    def strip_body(body):
        # Drop non-concrete elements (annotations etc.) from a production body.
        return [e for e in body if grammar.is_concrete_element(e)]

    # Produce start sets and (analogous) finish sets.
    # *_contains holds terminals known directly; *_includes holds
    # nonterminal inclusion edges resolved later by walk_graph.least_sets.
    start_contains = collections.defaultdict(set)
    start_includes = collections.defaultdict(set)
    finish_contains = collections.defaultdict(set)
    finish_includes = collections.defaultdict(set)
    for nt, nt_def in states.grammar.nonterminals.items():
        for prod in nt_def.rhs_list:
            # Skip error-recovery productions.
            if prod.body and isinstance(prod.body[-1], grammar.ErrorSymbol):
                continue
            rhs = strip_body(prod.body)
            # Scan forward: collect possible first symbols, continuing past
            # nullable nonterminals.
            for e in rhs:
                if isinstance(e, str):
                    start_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    start_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
            # Gather information for finish set (same scan, backwards).
            for e in reversed(rhs):
                if isinstance(e, str):
                    finish_contains[nt].add(e)
                    break
                elif isinstance(e, grammar.Nt):
                    finish_includes[nt].add(e)
                    if e not in empty_nts:
                        break
                else:
                    break
    start = walk_graph.least_sets(start_contains, start_includes)
    finish = walk_graph.least_sets(finish_contains, finish_includes)
    for nt in sorted(finish.keys(), key=repr):
        print(nt, "starts with", sorted(start[nt]))
        print(nt, "ends with", sorted(finish[nt]))

    # from stackoverflow
    from itertools import islice

    def window(seq, n=2):
        "Returns a sliding window (of width n) over data from the iterable"
        " s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) == n:
            yield result
        for elem in it:
            result = result[1:] + (elem,)
            yield result

    def which_can_appear_before(t):
        # elements that can start with `t`
        targets = {t} | {nt for nt in start if t in start[nt]}
        # elements that occur just before an element that can start with `t`
        preceders = {
            e1
            for nt, nt_def in states.grammar.nonterminals.items()
            for prod in nt_def.rhs_list
            for e1, e2 in window(strip_body(prod.body))
            if e2 in targets
        }

        def expand_element(e):
            # Resolve a preceder down to concrete terminals: nonterminals
            # expand through their finish sets; synthetic terminals expand
            # to the token set they stand for.
            if e in states.grammar.nonterminals:
                return {e2 for e1 in finish[e] for e2 in expand_element(e1)}
            elif e in ECMASCRIPT_SYNTHETIC_TERMINALS:
                return ECMASCRIPT_SYNTHETIC_TERMINALS[e]
            else:
                return [e]
        return {
            t
            for e in preceders
            for t in expand_element(e)
        }

    print("nonterminals that can begin with RegularExpressionLiteral:")
    for nt in sorted(start.keys(), key=repr):
        if 'RegularExpressionLiteral' in start[nt]:
            print(nt)
    print()
    print("things that can appear before RegularExpressionLiteral:")
    re_prec = which_can_appear_before('RegularExpressionLiteral')
    for e in sorted(re_prec, key=repr):
        print(e)
    print()
    print("things that can appear before `/`:")
    div_prec = which_can_appear_before('/')
    for e in sorted(div_prec, key=repr):
        print(e)
    print()
    print("both:")
    for e in sorted(re_prec & div_prec, key=repr):
        print(e)
    print()
def main():
    """Command-line entry point.

    Reads an .esgrammar (or .jsparagus_dump) file, builds or loads the
    parser states, runs the investigate2 diagnostic, and writes parser
    tables (or a state dump) to the output file.
    """
    # Read command-line options.
    arg_parser = argparse.ArgumentParser(
        description='Ponder the ECMAScript grammar.',
        allow_abbrev=False)
    default_filename = os.path.join(os.path.dirname(__file__),
                                    "es-simplified.esgrammar")
    arg_parser.add_argument(
        'filename', metavar='FILE', nargs='?', default=default_filename,
        help=".esgrammar (or .jsparagus_dump) input file")
    arg_parser.add_argument(
        '-o', '--output', metavar='FILE', default='/dev/stdout',
        help="output filename for parser tables")
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true',
        help="print some debug output")
    arg_parser.add_argument(
        '--progress', action='store_true',
        help="print a dot each time a state is analyzed (thousands of them)")
    args = arg_parser.parse_args()

    # Validate the input filename: its extension decides whether we parse
    # grammar source or load a previously dumped state table.
    in_filename = args.filename
    from_source = in_filename.endswith('.esgrammar')
    if not from_source and not in_filename.endswith('.jsparagus_dump'):
        raise ValueError("input file extension should be .esgrammar or .jsparagus_dump")

    # Validate the output filename: its extension selects the backend.
    out_filename = args.output
    targets_by_ext = {'.py': 'python', '.rs': 'rust', '.jsparagus_dump': 'dump'}
    for ext, candidate in targets_by_ext.items():
        if out_filename.endswith(ext):
            target = candidate
            break
    else:
        raise ValueError("-o file extension should be .py, .rs, or .jsparagus_dump")

    # Load input and analyze it.
    if from_source:
        with open(in_filename) as f:
            text = f.read()
        grammar = parse_esgrammar(
            text,
            filename=args.filename,
            goals=ECMASCRIPT_GOAL_NTS,
            synthetic_terminals=ECMASCRIPT_SYNTHETIC_TERMINALS)
        grammar = hack_grammar(grammar)
        if args.verbose:
            grammar.dump()
        states = jsparagus.gen.generate_parser_states(
            grammar, verbose=args.verbose, progress=args.progress)
    else:
        states = jsparagus.gen.ParserStates.load(in_filename)

    investigate2(states)

    # Generate output.
    try:
        if target == 'dump':
            states.save(out_filename)
        else:
            with open(out_filename, 'w') as f:
                jsparagus.gen.generate_parser(f, states,
                                              target=target,
                                              verbose=args.verbose)
    except Exception:
        # On failure, don't leave a partial output file lying around.
        try:
            os.remove(out_filename)
        except Exception:
            pass
        raise
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment