|
#!/usr/bin/env python |
|
|
|
import re |
|
import argparse |
|
from collections import namedtuple |
|
from graphviz import Digraph |
|
|
|
# Utility types to keep things a little bit neater |
|
Token = namedtuple('Token', ['token', 'value']) |
|
Rule = namedtuple('Rule', ['name', 'expression']) |
|
Grammar = namedtuple('Grammar', ['root', 'rules']) |
|
Tree = namedtuple('Tree', ['root', 'children', 'length']) |
|
|
|
terminals = ['identifier', 'integer', 'symbol', 'literal'] |
|
|
|
|
|
def tokenize(s): |
|
''' Return a list of tokens from a string ''' |
|
tokens = [] |
|
offset = 0 # Offset into string |
|
s = re.sub(r'%[^\n]*\n', '', s) # Remove comments |
|
while offset < len(s): |
|
# Newlines and spaces (ignored) |
|
if s[offset] == '\n' or s[offset] == ' ': |
|
offset = offset + 1 |
|
continue |
|
# Identifiers |
|
match = re.match(r'^[a-z_A-Z][a-zA-Z0-9]*', s[offset:]) |
|
if match is not None: |
|
tokens.append(Token('identifier', match.group())) |
|
offset = offset + len(match.group()) |
|
continue |
|
# Integers |
|
match = re.match(r'^[1-9][0-9]*|0', s[offset:]) |
|
if match is not None: |
|
tokens.append(Token('integer', match.group())) |
|
offset = offset + len(match.group()) |
|
continue |
|
# Symbols |
|
match = re.match(r'^[=|;:!<>\+\-\*\/\(\)]+', s[offset:]) |
|
if match is not None: |
|
tokens.append(Token('symbol', match.group())) |
|
offset = offset + len(match.group()) |
|
continue |
|
# Literals |
|
match = re.match(r'^"([^"\n]*)"', s[offset:]) |
|
if match is not None: |
|
tokens.append(Token('literal', match.groups()[0])) |
|
offset = offset + len(match.group()) |
|
continue |
|
# Error case |
|
raise ValueError('Unable to tokenize: {}'.format(s[offset:])) |
|
return tokens |
|
|
|
|
|
def is_term(token, symbol): |
|
''' Return true if token is the given terminal symbol ''' |
|
return token.token == 'symbol' and token.value == symbol |
|
|
|
|
|
def parse_grammar(tokens): |
|
''' Parse a list of tokens into a BNF grammar ''' |
|
rules = {} |
|
total = 0 |
|
first_rule = None |
|
while total < len(tokens): |
|
# Parse rule name |
|
assert tokens[total].token == 'identifier', '{} {}'.format( |
|
tokens[total], total) |
|
name = tokens[total].value |
|
if first_rule is None: |
|
first_rule = name |
|
total += 1 |
|
# Parse assignment symbol |
|
assert is_term(tokens[total], '=') |
|
total += 1 |
|
# Parse expression |
|
expression = [] |
|
while True: |
|
# Parse list |
|
list_ = [] |
|
while not (is_term(tokens[total], '|') or is_term(tokens[total], ';')): |
|
list_.append(tokens[total].value) |
|
total += 1 |
|
expression.append(list_) |
|
if is_term(tokens[total], ';'): |
|
break |
|
if is_term(tokens[total], '|'): |
|
total += 1 |
|
# Parse terminating semicolon |
|
assert is_term(tokens[total], ';') |
|
total += 1 |
|
# Add rule to list |
|
rules[name] = expression |
|
return Grammar(first_rule, rules) |
|
|
|
|
|
def parse(grammar, tokens, items=None): |
|
''' Given a grammar, parse a list of tokens into syntax trees given nonterminals ''' |
|
if items is None: |
|
items = [grammar.root] |
|
if len(items) == 0: |
|
return [] |
|
if len(tokens) == 0: |
|
return None |
|
|
|
item, *tail = items |
|
|
|
if item not in grammar.rules: |
|
if ((item in terminals and tokens[0].token == item) or |
|
(item == tokens[0].value and tokens[0].token in terminals)): |
|
tail_trees = parse(grammar, tokens[1:], tail) |
|
if tail_trees is not None: |
|
root = item |
|
if item == tokens[0].token and tokens[0].token != 'literal': |
|
root = root + '\n' + tokens[0].value |
|
return [Tree(root, [], 1)] + tail_trees |
|
return None |
|
|
|
for exp in grammar.rules[item]: |
|
exp_trees = parse(grammar, tokens, exp) |
|
if exp_trees is None: |
|
continue |
|
offset = sum(tree.length for tree in exp_trees) |
|
tail_trees = parse(grammar, tokens[offset:], tail) |
|
if tail_trees is None: |
|
continue |
|
return [Tree(item, exp_trees, offset)] + tail_trees |
|
return None |
|
|
|
|
|
def render(tree, filepath): |
|
''' Draw the syntax tree as a graph to an output image file ''' |
|
dot = Digraph(format='png') |
|
to_visit = [tree] |
|
while to_visit: |
|
node = to_visit.pop(0) |
|
dot.node(str(id(node)), label=str(node.root)) |
|
for child in node.children: |
|
if isinstance(child, Tree): |
|
dot.edge(str(id(node)), str(id(child))) |
|
to_visit.append(child) |
|
dot.render(filepath, cleanup=True) |
|
|
|
|
|
def main(): |
|
parser = argparse.ArgumentParser(description='Make syntax tree graphs') |
|
parser.add_argument('grammar_file', help='BNF-encoded grammar to parse') |
|
parser.add_argument('program_file', help='program to parse with grammar') |
|
parser.add_argument('output_image', help='image file to save syntax tree') |
|
args = parser.parse_args() |
|
|
|
grammar = parse_grammar(tokenize(open(args.grammar_file).read())) |
|
program = tokenize(open(args.program_file).read()) |
|
syntax, = parse(grammar, program) |
|
render(syntax, args.output_image) |
|
|
|
|
|
if __name__ == '__main__': |
|
main() |
Nice. Thank you!