|
#!/usr/bin/env python |
|
# -*- coding: utf-8 -*- |
|
|
|
# cy.py: a transpiler for an alternate syntax for C.
|
# Note: this is an incomplete work-in-progress. |
|
|
|
|
|
# Data structures: |
|
|
|
# A token is a tuple. |
|
# First element is a string 'token'. |
|
# Second element is a string which is the token type (e.g. 'OBRACKET'). |
|
# Third element is a string which is the source text of the token (e.g. '['). |
|
# e.g. ('token', 'OBRACKET', '[') |
|
# e.g. ('token', 'IDENTIFIER', 'foo') |
|
# e.g. ('token', 'WS', ' ') |
|
|
|
|
|
# An AST node is a tuple. |
|
# First element is a string 'ast'. |
|
# Second element is a string which is the AST node type (e.g. 'vardecl'). |
|
# Third element is a list of the child tuples (AST nodes or tokens).
|
# e.g. ('ast', 'type', [('token', 'IDENTIFIER', 'int')]) |
|
|
|
|
|
# Grammar: |
|
# |
|
# program = statement [WS] { statement [WS] } |
|
# statement = declassign | assign | fundecl | vardecl | scope | return |
|
# fundecl = FUNC WS IDENTIFIER fundeclargs [ fundeclret ] scope |
|
# fundeclargs = OPAREN [ [WS] vardecl { COMMA WS vardecl } [WS] ] CPAREN |
|
# fundeclret = WS ARROW WS type |
|
# vardecl = IDENTIFIER COLON WS type |
|
# scope = COLON INDENT statement { DENT statement } DEDENT |
|
# return = RETURN [ WS expr ]
|
# declassign = vardecl WS EQUAL WS expr |
|
# assign = IDENTIFIER WS EQUAL WS expr |
|
# expr = INTLIT | FLOATLIT | STRINGLIT | BOOLLIT |
|
# type = POINTER LT type GT |
|
# | ARRAY LT type GT |
|
# | FUNCTION LT type GT |
|
# | ARRAY OBRACKET INTLIT CBRACKET LT type GT |
|
# | IDENTIFIER |
|
# |
|
# Note: lower-case names are AST nodes.
|
# Note: ALL CAPS are tokens (terminals). |
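
# Illustrative end-to-end example (a sketch assuming a suitable tokendefs.txt,
# not a guaranteed output):
#
#   func main() -> int:
#       x: int = 42
#       return x
#
# should transpile to:
#
#   int main() {
#       int x = 42;
#       return x;
#   }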
|
|
|
|
|
# Token data structure utils: |
|
|
|
def is_token(token): |
|
"""Returns whether the arg is a token.""" |
|
return isinstance(token, tuple) and len(token) > 0 and token[0] == 'token' |
|
|
|
|
|
def token_type(token): |
|
"""Returns the type of the token (e.g. 'IDENTIFIER').""" |
|
assert is_token(token) |
|
return token[1] |
|
|
|
|
|
def token_text(token): |
|
"""Returns the source text of the token (e.g. 'foo' for a 'IDENTIFIER' token).""" |
|
assert is_token(token) |
|
return token[2] |
|
|
|
|
|
def is_toktype(token, toktype): |
|
""" |
|
Returns whether the token is of the given type. |
|
_toktype_ is e.g. 'IDENTIFIER', 'COLON', etc. |
|
""" |
|
assert is_token(token) |
|
return token_type(token) == toktype |
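
# Illustrative usage of the token utils above:
#   tok = ('token', 'IDENTIFIER', 'foo')
#   token_type(tok)                # -> 'IDENTIFIER'
#   token_text(tok)                # -> 'foo'
#   is_toktype(tok, 'IDENTIFIER')  # -> True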
|
|
|
|
|
|
|
# Tokenizer implementation: |
|
|
|
def load_tokendefs(fpath): |
|
""" |
|
Load the token definitions from the file at fpath. |
|
|
|
Returns a list of tuples of the form (<token type>, <compiled regex>). |
|
|
|
The format of the tokendefs file should be pairs of lines: |
|
- a line which is the token type (e.g. 'IDENTIFIER', 'OBRACKET', etc), |
|
- followed by a line which is the regex which recognizes that token. |
|
|
|
Example tokendefs.txt: |
|
IDENTIFIER |
|
[a-zA-Z][a-zA-Z0-9]* |
|
OPAREN |
|
\( |
|
CPAREN |
|
    \)
|
""" |
|
import re |
|
tokendefs = [] |
|
with open(fpath) as f: |
|
for line in f: |
|
token_name = line.rstrip('\n') |
|
            pattern = next(f).rstrip('\n')
|
regex = re.compile(pattern) |
|
tokendef = (token_name, regex) |
|
tokendefs.append(tokendef) |
|
return tokendefs |
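
# Illustrative result: given the example tokendefs.txt in the docstring above,
# load_tokendefs would return something like:
#   [('IDENTIFIER', re.compile('[a-zA-Z][a-zA-Z0-9]*')),
#    ('OPAREN', re.compile('\\(')),
#    ('CPAREN', re.compile('\\)'))]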
|
|
|
|
|
def offside_rule(token, previous, indent): |
|
""" |
|
Implement the "offside rule" indentation-based scoping mechanism. |
|
See https://en.wikipedia.org/wiki/Off-side_rule |
|
|
|
If this is a WS (containing a newline) following a COLON and indentation has increased, |
|
sub in an INDENT token. |
|
If this is a WS (containing a newline) and indentation has decreased, |
|
sub in a DEDENT token. |
|
If this is a WS (containing a newline) and indentation hasn't changed, |
|
sub in a DENT token. |
|
Otherwise, return the token unmodified. |
|
""" |
|
assert is_token(token) |
|
assert is_token(previous) or previous is None |
|
text = token_text(token) |
|
if is_toktype(token, 'WS') and '\n' in text: |
|
indentation = text.split('\n')[-1] |
|
        # Note: the comparison below checks a character count against a nesting
        # depth, so it effectively only handles one level of indentation (WIP).
        if previous is not None and is_toktype(previous, 'COLON') and len(indentation) > indent:
|
token = ('token', 'INDENT', text) |
|
indent += 1 |
|
elif len(indentation) < indent: |
|
token = ('token', 'DEDENT', text) |
|
indent -= 1 |
|
assert indent >= 0 |
|
else: |
|
if indent > 0: |
|
token = ('token', 'DENT', text) |
|
return (token, indent) |
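
# Illustrative example (token names assume a suitable tokendefs.txt):
# tokenizing "func f():\n    return 0\n" turns the WS token '\n    '
# (which follows the COLON) into ('token', 'INDENT', '\n    '), and the
# trailing WS token '\n' into ('token', 'DEDENT', '\n').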
|
|
|
|
|
def tokenize(tokendefs, input, indent=0): |
|
"""Uses tokendefs to tokenize the 'input' string and return a list of tokens""" |
|
tokens = [] |
|
offset = 0 |
|
previous = None |
|
while offset < len(input): |
|
for (token_name, regex) in tokendefs: |
|
m = regex.match(input, offset) |
|
if m is not None: |
|
matched_text = m.group(0) |
|
offset += len(matched_text) |
|
token = ('token', token_name, matched_text) |
|
(token, indent) = offside_rule(token, previous, indent) |
|
if token is not None: |
|
tokens.append(token) |
|
previous = token |
|
break |
|
else: |
|
raise Exception("Couldn't tokenize starting at '%s...'" % input[offset:offset+16]) |
|
return tokens |
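
# Illustrative example (token names assume a suitable tokendefs.txt):
#   tokenize(tokendefs, 'x: int')
# would return:
#   [('token', 'IDENTIFIER', 'x'),
#    ('token', 'COLON', ':'),
#    ('token', 'WS', ' '),
#    ('token', 'IDENTIFIER', 'int')]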
|
|
|
|
|
|
|
# AST node data structure utils: |
|
|
|
def is_ast(ast): |
|
"""Returns whether the arg is an AST node.""" |
|
return isinstance(ast, tuple) and len(ast) > 0 and ast[0] == 'ast' |
|
|
|
|
|
def is_asttype(ast, asttype): |
|
""" |
|
Returns whether the AST node is of the given type. |
|
_asttype_ is e.g. 'vardecl', 'statement', etc. |
|
""" |
|
assert is_ast(ast) |
|
return ast[1] == asttype |
|
|
|
|
|
def ast_children(ast): |
|
"""Return the child nodes of the given AST.""" |
|
assert is_ast(ast) |
|
return ast[2] |
|
|
|
|
|
# Parser implementation: |
|
|
|
# Note: all parse_x functions share a similar format: |
|
# - Their first argument is the list of tokens. |
|
# - They return a tuple of (<parsed result>, <remaining tokens>). |
|
# - <parsed result> is either an AST node or a token (a terminal). |
|
# - <remaining tokens> is the list of unconsumed tokens. |
|
# - If the parse fails, (None, <tokens>) is returned, where <tokens> |
|
# is the list of tokens which was passed in. |
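
# Illustrative example of the calling convention:
#   (ast, remaining) = parse_vardecl(tokens)
#   if ast is None:
#       pass  # parse failed; 'remaining' is the token list that was passed in
#   else:
#       pass  # parse succeeded; 'remaining' holds the unconsumed tokens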
|
|
|
def alternation(tokens, funcs): |
|
"""Try each of the parsing functions until one succeeds.""" |
|
failure = (None, tokens) |
|
for f in funcs: |
|
(ast, tokens) = f(tokens) |
|
if ast is not None: |
|
return (ast, tokens) |
|
return failure |
|
|
|
|
|
def parse_terminal(tokens, toktype): |
|
""" |
|
Attempts to parse a terminal node of type _toktype_. |
|
Note that the parsed result of a terminal is the token itself, not an AST node. |
|
""" |
|
failure = (None, tokens) |
|
if len(tokens) > 0 and is_toktype(tokens[0], toktype): |
|
return (tokens[0], tokens[1:]) |
|
else: |
|
return failure |
|
|
|
|
|
# fundecl examples: |
|
# func f(): |
|
# func f(a: int): |
|
# func f() -> int: |
|
# func f(a: int) -> int: |
|
# func f(a: int, b: int): |
|
# func f(a: int, b: int) -> int: |
|
def parse_fundecl(tokens): |
|
""" |
|
Attempts to parse a function declaration. |
|
|
|
Grammar: |
|
fundecl = FUNC WS IDENTIFIER fundeclargs [ fundeclret ] scope |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = parse_terminal(tokens, 'FUNC') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(identifier_token, tokens) = parse_terminal(tokens, 'IDENTIFIER') |
|
if identifier_token is None: |
|
return failure |
|
(args_ast, tokens) = parse_fundeclargs(tokens) |
|
if args_ast is None: |
|
return failure |
|
(ret_ast, tokens) = parse_fundeclret(tokens) |
|
(scope_ast, tokens) = parse_scope(tokens) |
|
if scope_ast is None: |
|
return failure |
|
ast = ('ast', 'fundecl', [identifier_token, args_ast, ret_ast, scope_ast]) |
|
return (ast, tokens) |
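
# Illustrative example: 'func f() -> int:' followed by an indented body parses
# into (child ASTs elided):
#   ('ast', 'fundecl', [
#       ('token', 'IDENTIFIER', 'f'),
#       ('ast', 'fundeclargs', []),
#       ('ast', 'fundeclret', [...]),
#       ('ast', 'scope', [...])])
# Note: the fundeclret slot is None when no return type is declared.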
|
|
|
|
|
def parse_fundeclargs(tokens): |
|
""" |
|
Attempts to parse a list of function declaration arguments. |
|
|
|
Grammar: |
|
fundeclargs = OPAREN [ [WS] vardecl { COMMA WS vardecl } [WS] ] CPAREN |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = parse_terminal(tokens, 'OPAREN') |
|
if ast is None: |
|
return failure |
|
|
|
args = [] |
|
(_, tokens) = parse_terminal(tokens, 'WS') |
|
(arg, tokens) = parse_vardecl(tokens) |
|
if arg is not None: |
|
args.append(arg) |
|
while True: |
|
(ast, tokens) = parse_terminal(tokens, 'COMMA') |
|
if ast is None: |
|
break |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
break |
|
(arg, tokens) = parse_vardecl(tokens) |
|
            if arg is None:
|
break |
|
args.append(arg) |
|
|
|
    # Allow optional trailing whitespace before the closing paren (per the grammar).
    (_, tokens) = parse_terminal(tokens, 'WS')
    (ast, tokens) = parse_terminal(tokens, 'CPAREN')
|
if ast is None: |
|
return failure |
|
|
|
ast = ('ast', 'fundeclargs', args) |
|
return (ast, tokens) |
|
|
|
|
|
def parse_fundeclret(tokens): |
|
""" |
|
Attempts to parse a function declaration return value. |
|
|
|
Grammar: |
|
fundeclret = WS ARROW WS type |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'ARROW') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_type(tokens) |
|
if ast is None: |
|
return failure |
|
|
|
ast = ('ast', 'fundeclret', [ast]) |
|
return (ast, tokens) |
|
|
|
|
|
def parse_scope(tokens): |
|
""" |
|
Attempts to parse a scoped sequence of statements. |
|
|
|
Grammar: |
|
scope = COLON INDENT statement { DENT statement } DEDENT |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = parse_terminal(tokens, 'COLON') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'INDENT') |
|
if ast is None: |
|
return failure |
|
|
|
statements = [] |
|
(ast, tokens) = parse_statement(tokens) |
|
if ast is None: |
|
return failure |
|
statements.append(ast) |
|
while True: |
|
(ast, tokens) = parse_terminal(tokens, 'DENT') |
|
if ast is None: |
|
break |
|
(ast, tokens) = parse_statement(tokens) |
|
if ast is None: |
|
break |
|
statements.append(ast) |
|
|
|
(ast, tokens) = parse_terminal(tokens, 'DEDENT') |
|
if ast is None: |
|
return failure |
|
|
|
ast = ('ast', 'scope', statements) |
|
return (ast, tokens) |
|
|
|
|
|
def parse_vardecl(tokens): |
|
""" |
|
Attempts to parse a vardecl AST node. |
|
|
|
Grammar: |
|
vardecl = IDENTIFIER COLON WS type |
|
""" |
|
failure = (None, tokens) |
|
(identifier_token, tokens) = parse_terminal(tokens, 'IDENTIFIER') |
|
if identifier_token is None: |
|
return failure |
|
(colon_token, tokens) = parse_terminal(tokens, 'COLON') |
|
if colon_token is None: |
|
return failure |
|
(wspace_token, tokens) = parse_terminal(tokens, 'WS') |
|
if wspace_token is None: |
|
return failure |
|
(type_ast, tokens) = parse_type(tokens) |
|
if type_ast is None: |
|
return failure |
|
vardecl_ast = ('ast', 'vardecl', [identifier_token, type_ast]) |
|
return (vardecl_ast, tokens) |
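
# Illustrative example: the tokens for 'x: int' parse into:
#   ('ast', 'vardecl', [
#       ('token', 'IDENTIFIER', 'x'),
#       ('ast', 'type', [('token', 'IDENTIFIER', 'int')])])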
|
|
|
|
|
def parse_type(tokens): |
|
""" |
|
    Attempts to parse a type declaration.
|
|
|
Grammar: |
|
type = POINTER LT type GT |
|
| ARRAY LT type GT |
|
| FUNCTION LT type GT |
|
| ARRAY OBRACKET INTLIT CBRACKET LT type GT |
|
| IDENTIFIER |
|
""" |
|
|
|
def parse_type1(tokens, toktype): |
|
""" |
|
Grammar fragments: |
|
POINTER LT type GT |
|
ARRAY LT type GT |
|
|
|
'array<int>' becomes: |
|
('ast', 'type', [ |
|
('token', 'ARRAY', 'array'), |
|
('ast', 'type', [ |
|
('token', 'IDENTIFIER', 'int') |
|
]) |
|
]) |
|
|
|
'pointer<char>' becomes: |
|
('ast', 'type', [ |
|
('token', 'POINTER', 'pointer'), |
|
('ast', 'type', [ |
|
('token', 'IDENTIFIER', 'char') |
|
]) |
|
]) |
|
|
|
'array<pointer<char>>' becomes: |
|
('ast', 'type', [ |
|
('token', 'ARRAY', 'array'), |
|
('ast', 'type', [ |
|
('token', 'POINTER', 'pointer'), |
|
('ast', 'type', [ |
|
('token', 'IDENTIFIER', 'char') |
|
]) |
|
]) |
|
]) |
|
""" |
|
failure = (None, tokens) |
|
(identifier_token, tokens) = parse_terminal(tokens, toktype) |
|
if identifier_token is None: |
|
return failure |
|
(lessthan, tokens) = parse_terminal(tokens, 'LT') |
|
if lessthan is None: |
|
return failure |
|
(subtype_ast, tokens) = parse_type(tokens) |
|
if subtype_ast is None: |
|
return failure |
|
(greaterthan, tokens) = parse_terminal(tokens, 'GT') |
|
if greaterthan is None: |
|
return failure |
|
ast = ('ast', 'type', [identifier_token, subtype_ast]) |
|
return (ast, tokens) |
|
|
|
def parse_type2(tokens): |
|
""" |
|
Grammar fragment: |
|
ARRAY OBRACKET INTLIT CBRACKET LT type GT |
|
|
|
'array[8]<int>' becomes: |
|
('ast', 'type', [ |
|
('token', 'ARRAY', 'array'), |
|
('token', 'INTLIT', '8'), |
|
('ast', 'type', [ |
|
('token', 'IDENTIFIER', 'int') |
|
]) |
|
]) |
|
""" |
|
failure = (None, tokens) |
|
(identifier_token, tokens) = parse_terminal(tokens, 'ARRAY') |
|
if identifier_token is None: |
|
return failure |
|
(obracket, tokens) = parse_terminal(tokens, 'OBRACKET') |
|
if obracket is None: |
|
return failure |
|
(intlit, tokens) = parse_terminal(tokens, 'INTLIT') |
|
if intlit is None: |
|
return failure |
|
(cbracket, tokens) = parse_terminal(tokens, 'CBRACKET') |
|
        if cbracket is None:
|
return failure |
|
(lessthan, tokens) = parse_terminal(tokens, 'LT') |
|
if lessthan is None: |
|
return failure |
|
(subtype_ast, tokens) = parse_type(tokens) |
|
if subtype_ast is None: |
|
return failure |
|
(greaterthan, tokens) = parse_terminal(tokens, 'GT') |
|
if greaterthan is None: |
|
return failure |
|
ast = ('ast', 'type', [identifier_token, intlit, subtype_ast]) |
|
return (ast, tokens) |
|
|
|
def parse_type3(tokens): |
|
""" |
|
Grammar fragment: |
|
IDENTIFIER |
|
|
|
'int' becomes: |
|
('ast', 'type', [('token', 'IDENTIFIER', 'int')]) |
|
|
|
'char' becomes: |
|
('ast', 'type', [('token', 'IDENTIFIER', 'char')]) |
|
""" |
|
failure = (None, tokens) |
|
(identifier_token, tokens) = parse_terminal(tokens, 'IDENTIFIER') |
|
if identifier_token is None: |
|
return failure |
|
type_ast = ('ast', 'type', [identifier_token]) |
|
return (type_ast, tokens) |
|
|
|
failure = (None, tokens) |
|
(ast, tokens) = parse_type1(tokens, 'POINTER') |
|
if ast is not None: |
|
return (ast, tokens) |
|
(ast, tokens) = parse_type1(tokens, 'ARRAY') |
|
if ast is not None: |
|
return (ast, tokens) |
|
(ast, tokens) = parse_type1(tokens, 'FUNCTION') |
|
if ast is not None: |
|
return (ast, tokens) |
|
(ast, tokens) = parse_type2(tokens) |
|
if ast is not None: |
|
return (ast, tokens) |
|
(ast, tokens) = parse_type3(tokens) |
|
if ast is not None: |
|
return (ast, tokens) |
|
return failure |
|
|
|
|
|
def parse_return(tokens): |
|
""" |
|
Attempts to parse a return statement. |
|
|
|
Grammar: |
|
return = RETURN [ WS expr ] |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = parse_terminal(tokens, 'RETURN') |
|
if ast is None: |
|
return failure |
|
return_ast = ('ast', 'return', []) |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
        return (return_ast, tokens)
|
(expr_ast, tokens) = parse_expr(tokens) |
|
if expr_ast is not None: |
|
return_ast = ('ast', 'return', [expr_ast]) |
|
return (return_ast, tokens) |
|
|
|
|
|
def parse_expr(tokens): |
|
""" |
|
Attempts to parse an expression. |
|
|
|
Grammar: |
|
expr = INTLIT | FLOATLIT | STRINGLIT | BOOLLIT |
|
""" |
|
    failure = (None, tokens)
    # Try each literal token type in turn; the first match wins.
    for toktype in ('INTLIT', 'FLOATLIT', 'STRINGLIT', 'BOOLLIT'):
        (token, remaining) = parse_terminal(tokens, toktype)
        if token is not None:
            ast = ('ast', 'expr', [token])
            return (ast, remaining)
    return failure
|
|
|
|
|
def parse_declassign(tokens): |
|
""" |
|
Attempts to parse a declaration-assignment statement. |
|
|
|
Grammar: |
|
declassign = vardecl WS EQUAL WS expr |
|
""" |
|
failure = (None, tokens) |
|
(vardecl_ast, tokens) = parse_vardecl(tokens) |
|
if vardecl_ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'EQUAL') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(expr_ast, tokens) = parse_expr(tokens) |
|
if expr_ast is None: |
|
return failure |
|
ast = ('ast', 'declassign', [vardecl_ast, expr_ast]) |
|
return (ast, tokens) |
|
|
|
|
|
def parse_assign(tokens): |
|
""" |
|
Attempts to parse an assignment statement. |
|
|
|
Grammar: |
|
assign = IDENTIFIER WS EQUAL WS expr |
|
""" |
|
failure = (None, tokens) |
|
(identifier_token, tokens) = parse_terminal(tokens, 'IDENTIFIER') |
|
if identifier_token is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'EQUAL') |
|
if ast is None: |
|
return failure |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
if ast is None: |
|
return failure |
|
(expr_ast, tokens) = parse_expr(tokens) |
|
    if expr_ast is None:
|
return failure |
|
ast = ('ast', 'assign', [identifier_token, expr_ast]) |
|
return (ast, tokens) |
|
|
|
|
|
def parse_statement(tokens): |
|
""" |
|
Attempts to parse a statement. |
|
|
|
Grammar: |
|
statement = declassign | assign | fundecl | vardecl | scope | return |
|
""" |
|
failure = (None, tokens) |
|
(ast, tokens) = alternation(tokens, [ |
|
parse_declassign, |
|
parse_assign, |
|
parse_fundecl, |
|
parse_vardecl, |
|
parse_scope, |
|
parse_return |
|
]) |
|
if ast is None: |
|
return failure |
|
statement_ast = ('ast', 'statement', [ast]) |
|
return (statement_ast, tokens) |
|
|
|
|
|
def parse_program(tokens): |
|
""" |
|
Attempts to parse a program. |
|
|
|
Grammar: |
|
        program = statement [WS] { statement [WS] }
|
""" |
|
failure = (None, tokens) |
|
statement_asts = [] |
|
(ast, tokens) = parse_statement(tokens) |
|
if ast is None: |
|
return failure |
|
statement_asts.append(ast) |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
while True: |
|
(ast, tokens) = parse_statement(tokens) |
|
if ast is None: |
|
break |
|
statement_asts.append(ast) |
|
(ast, tokens) = parse_terminal(tokens, 'WS') |
|
ast = ('ast', 'program', statement_asts) |
|
return (ast, tokens) |
|
|
|
|
|
def parse(tokens): |
|
"""Attempts to parse the tokens into an AST.""" |
|
(ast, tokens) = parse_program(tokens) |
|
if ast is None: |
|
raise Exception("Couldn't parse starting at %s" % tokens[:4]) |
|
if len(tokens) > 0: |
|
raise Exception("Leftover tokens: %s" % tokens) |
|
return ast |
|
|
|
|
|
|
|
# Output generator implementation: |
|
|
|
# Note: all generate_x functions share a similar format: |
|
# - Their first argument is an AST.
|
# - They return a string (of C code). |
|
# - If the passed AST is valid, generation cannot fail; an exception is
#   raised otherwise.
|
|
|
|
|
def generate_type(ast, identifier=""): |
|
""" |
|
Generate a C type declaration. |
|
|
|
If identifier is not given, this is an abstract type declaration (e.g. a cast). |
|
""" |
|
def did_switch_direction(previous, current): |
|
""" |
|
        Detect a change in 'direction' while interpreting the type stack.
|
|
|
C type declaration operator precedence requires that we "go right" first. |
|
i.e. 'int *foo[]' is "array of pointer to int", not "pointer to array of int". |
|
        In order to express "pointer to array of int", we have to use parentheses
        to overcome operator precedence, i.e. 'int (*foo)[]'.
        Any time we need to "change direction", we wrap in parentheses.
|
See http://unixwiz.net/techtips/reading-cdecl.html |
|
""" |
|
lefts = ['pointer'] |
|
rights = ['array', 'function'] |
|
return (previous in lefts and current in rights) \ |
|
or (previous in rights and current in lefts) |
|
|
|
assert is_asttype(ast, 'type') |
|
types = type_ast_as_list(ast) |
|
output = identifier |
|
previous = None |
|
for t in types[:-1]: |
|
if t == 'pointer': |
|
if did_switch_direction(previous, t): |
|
output = "*(%s)" % output |
|
else: |
|
output = "*%s" % output |
|
elif t == 'array': |
|
if did_switch_direction(previous, t): |
|
output = "(%s)[]" % output |
|
else: |
|
output = "%s[]" % output |
|
elif t == 'function': |
|
if did_switch_direction(previous, t): |
|
output = "(%s)()" % output |
|
else: |
|
output = "%s()" % output |
|
elif t.startswith('array:'): |
|
array_size = int(t.split(':')[1]) |
|
if did_switch_direction(previous, 'array'): |
|
output = "(%s)[%i]" % (output, array_size) |
|
else: |
|
output = "%s[%i]" % (output, array_size) |
|
else: |
|
raise Exception("generate_type: unexpected type '%s'." % t) |
|
|
|
if t.startswith('array:'): |
|
previous = 'array' |
|
else: |
|
previous = t |
|
|
|
base_type = types[-1] |
|
if identifier == "": |
|
output = "%s%s" % (base_type, output) |
|
else: |
|
output = "%s %s" % (base_type, output) |
|
return output |
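
# Illustrative outputs (the identifier 'foo' is hypothetical):
#   'int'                  -> generate_type(ast, 'foo') -> 'int foo'
#   'pointer<char>'        -> generate_type(ast, 'foo') -> 'char *foo'
#   'pointer<array<int>>'  -> generate_type(ast, 'foo') -> 'int (*foo)[]'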
|
|
|
|
|
def type_ast_as_list(ast): |
|
""" |
|
Return the type AST as a list of types. |
|
|
|
e.g. |
|
- 'int' results in ['int'] |
|
- 'pointer<array<char>>' results in ['pointer', 'array', 'char'] |
|
- 'array[8]<int>' results in ['array:8', 'int'] |
|
""" |
|
assert is_asttype(ast, 'type') |
|
children = ast_children(ast) |
|
if len(children) == 1: |
|
# if there is only one child, it is the "base" type (e.g. int, char, etc). |
|
assert is_token(children[0]) |
|
token = children[0] |
|
assert is_toktype(token, 'IDENTIFIER') |
|
# return e.g. ['int'], ['char'], etc. |
|
return [token_text(token)] |
|
elif len(children) == 2: |
|
# if there are 2 children, this is either array or pointer or function. |
|
assert is_toktype(children[0], 'ARRAY') \ |
|
or is_toktype(children[0], 'POINTER') \ |
|
or is_toktype(children[0], 'FUNCTION') |
|
token = children[0] |
|
assert is_ast(children[1]) |
|
sub_ast = children[1] |
|
        assert is_asttype(sub_ast, 'type')
|
# return e.g. ['array', <recursive call>] |
|
return [token_text(token)] + type_ast_as_list(sub_ast) |
|
elif len(children) == 3: |
|
# if there are three children, this is a dimensioned array (e.g. array[8]). |
|
assert is_toktype(children[0], 'ARRAY') |
|
assert is_toktype(children[1], 'INTLIT') |
|
intlit_token = children[1] |
|
array_size = int(token_text(intlit_token)) |
|
assert is_ast(children[2]) |
|
sub_ast = children[2] |
|
assert is_asttype(sub_ast, 'type') |
|
# return e.g. ['array:8', <recursive call>] |
|
return ["array:%s" % (array_size)] + type_ast_as_list(sub_ast) |
|
else: |
|
raise Exception("type_ast_as_list: type AST node with more than 3 children!") |
|
|
|
|
|
def generate_vardecl(ast): |
|
""" |
|
Generate a C variable declaration. |
|
|
|
Note: we don't append the trailing ';' here because this is also used for function args. |
|
""" |
|
assert is_asttype(ast, 'vardecl') |
|
children = ast_children(ast) |
|
|
|
identifier_token = children[0] |
|
assert is_toktype(identifier_token, 'IDENTIFIER') |
|
var_name = token_text(identifier_token) |
|
|
|
type_ast = children[1] |
|
assert is_asttype(type_ast, 'type') |
|
|
|
# e.g. 'int a', 'char* b', 'int c[8]', 'char** d', 'int* d[3]', etc. |
|
return "%s" % generate_type(type_ast, var_name) |
|
|
|
|
|
def generate_fundecl(ast): |
|
"""Generate a C function declaration.""" |
|
assert is_asttype(ast, 'fundecl') |
|
|
|
identifier_token = ast_children(ast)[0] |
|
assert is_toktype(identifier_token, 'IDENTIFIER') |
|
identifier = token_text(identifier_token) |
|
|
|
fundeclargs_ast = ast_children(ast)[1] |
|
assert is_asttype(fundeclargs_ast, 'fundeclargs') |
|
arg_asts = ast_children(fundeclargs_ast) |
|
args = [] |
|
for vardecl_ast in arg_asts: |
|
arg = generate_vardecl(vardecl_ast) |
|
args.append(arg) |
|
args_output = ", ".join(args) |
|
|
|
fundeclret_ast = ast_children(ast)[2] |
|
if fundeclret_ast is None: |
|
ret = "void" |
|
else: |
|
ret_ast = ast_children(fundeclret_ast)[0] |
|
ret = generate_type(ret_ast) |
|
|
|
scope_ast = ast_children(ast)[3] |
|
scope = generate_scope(scope_ast) |
|
|
|
output = "%s %s(%s)%s" % (ret, identifier, args_output, scope) |
|
return output |
|
|
|
|
|
def generate_scope(ast, indent=0): |
|
"""Generate a C scope.""" |
|
assert is_ast(ast) |
|
assert is_asttype(ast, 'scope') |
|
output = " {" |
|
for statement_ast in ast_children(ast): |
|
statement = generate_statement(statement_ast) |
|
line = "\n" + (" " * 4 * (indent+1)) + statement |
|
output += line |
|
line = "\n" + (" " * 4 * (indent)) + "}" |
|
output += line |
|
return output |
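
# Illustrative example: a scope containing the single statement 'return 0;'
# yields (with indent=0):
#   ' {\n    return 0;\n}'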
|
|
|
|
|
def generate_expr(ast): |
|
"""Generate a C expression.""" |
|
assert is_asttype(ast, 'expr') |
|
child = ast_children(ast)[0] |
|
if is_token(child): |
|
token = child |
|
        if is_toktype(token, 'INTLIT') \
                or is_toktype(token, 'FLOATLIT') \
                or is_toktype(token, 'STRINGLIT') \
                or is_toktype(token, 'BOOLLIT'):
            output = token_text(token)
            return output
|
raise Exception("generate_expr: expected an INTLIT, got %s" % (ast,)) |
|
|
|
|
|
def generate_return(ast): |
|
"""Generate a C return statement.""" |
|
assert is_asttype(ast, 'return') |
|
children = ast_children(ast) |
|
if len(children) == 0: |
|
return "return;" |
|
else: |
|
expr_ast = children[0] |
|
return "return %s;" % generate_expr(expr_ast) |
|
|
|
|
|
def generate_declassign(ast): |
|
"""Generate a C declaration-assignment statement.""" |
|
assert is_asttype(ast, 'declassign') |
|
children = ast_children(ast) |
|
vardecl_ast = children[0] |
|
assert is_asttype(vardecl_ast, 'vardecl') |
|
vardecl = generate_vardecl(vardecl_ast) |
|
expr_ast = children[1] |
|
assert is_asttype(expr_ast, 'expr') |
|
expr = generate_expr(expr_ast) |
|
return "%s = %s;" % (vardecl, expr) |
|
|
|
|
|
def generate_assign(ast): |
|
"""Generate a C assignment statement.""" |
|
assert is_asttype(ast, 'assign') |
|
children = ast_children(ast) |
|
identifier_token = children[0] |
|
assert is_toktype(identifier_token, 'IDENTIFIER') |
|
identifier = token_text(identifier_token) |
|
expr_ast = children[1] |
|
assert is_asttype(expr_ast, 'expr') |
|
expr = generate_expr(expr_ast) |
|
return "%s = %s;" % (identifier, expr) |
|
|
|
|
|
def generate_statement(ast): |
|
"""Generate a C statement.""" |
|
assert is_asttype(ast, 'statement') |
|
sub_ast = ast_children(ast)[0] |
|
assert is_ast(sub_ast) |
|
if is_asttype(sub_ast, 'fundecl'): |
|
return generate_fundecl(sub_ast) |
|
elif is_asttype(sub_ast, 'vardecl'): |
|
return generate_vardecl(sub_ast) + ';' |
|
elif is_asttype(sub_ast, 'scope'): |
|
return generate_scope(sub_ast) |
|
elif is_asttype(sub_ast, 'return'): |
|
return generate_return(sub_ast) |
|
elif is_asttype(sub_ast, 'declassign'): |
|
return generate_declassign(sub_ast) |
|
elif is_asttype(sub_ast, 'assign'): |
|
return generate_assign(sub_ast) |
|
else: |
|
raise Exception("generate_statement: don't know how to generate %s" % sub_ast) |
|
|
|
|
|
def generate_program(ast): |
|
"""Generate a C program.""" |
|
assert is_asttype(ast, 'program') |
|
outputs = [] |
|
for statement_ast in ast_children(ast): |
|
outputs.append(generate_statement(statement_ast)) |
|
return '\n'.join(outputs) + '\n' |
|
|
|
|
|
def generate(ast): |
|
"""Generate C code from the given AST.""" |
|
return generate_program(ast) |
|
|
|
|
|
if __name__ == "__main__": |
|
import sys |
|
import pprint |
|
tdefs = load_tokendefs("tokendefs.txt") |
|
input = None |
|
if len(sys.argv) > 1: |
|
input = open(sys.argv[-1]).read() |
|
else: |
|
input = sys.stdin.read() |
|
tokens = tokenize(tdefs, input) |
|
if '--tokens' in sys.argv: |
|
pprint.pprint(tokens) |
|
sys.exit(0) |
|
ast = parse(tokens) |
|
if '--ast' in sys.argv: |
|
pprint.pprint(ast) |
|
sys.exit(0) |
|
output = generate(ast) |
|
sys.stdout.write(output) |
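

# Illustrative usage (file names are hypothetical; expects tokendefs.txt in
# the current directory):
#   python cy.py input.cy > output.c
#   python cy.py --tokens input.cy   # dump the token list and exit
#   python cy.py --ast input.cy      # dump the AST and exit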