Pratt Parsers: Expression Parsing Made Easy
# Pratt Parser in Python
# https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
#
# Works with input like `a + b`.
# The lexer does not support numbers (yet).
#
# https://gist.github.com/millerdev/03d881620dd0879d4347b0e971ff0be5
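#
# Illustrative example session (the AST line is the default dataclass repr
# produced by main() below):
#
#   Type an expression:
#   > a + b
#   AST: OperatorExpression(left=NameExpression(name='a'), operator='+', right=NameExpression(name='b'))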
from dataclasses import dataclass


def main():
    while True:
        print("Type an expression:")
        try:
            expr = input("> ")
            if not expr:
                raise EOFError
        except EOFError:
            print()
            break
        print("AST:", parse(lex(expr)))
        print()


def parse(tokens):
    parser = Parser(tokens)
    return parse_expression(parser, 0)
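

# The core of the Pratt technique (see the article linked above):
# parse_expression() consumes one token and dispatches to a prefix parser
# ("parselet") for it, then repeatedly consumes infix parsers for the
# following tokens, but only while the next token binds more tightly than
# the `precedence` this call was made with.  Each infix parser recurses with
# its own operator's precedence, which is how operator grouping falls out.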
def parse_expression(parser, precedence):
    token = parser.consume()
    parse_prefix = PREFIX_PARSERS.get(token.type)
    if parse_prefix is None:
        raise ParseError(f"Could not parse: {token}")
    left = parse_prefix(parser, token)
    while precedence < get_precedence(parser):
        token = parser.consume()
        parse_infix = INFIX_PARSERS[token.type]
        left = parse_infix(parser, left, token)
    return left


def get_precedence(parser):
    token = parser.look_ahead()
    parse = INFIX_PARSERS.get(token.type)
    return 0 if parse is None else parse.precedence


def parse_name(parser, token):
    return NameExpression(token.value)


def parse_prefix_expression(parser, token, *, precedence):
    operand = parse_expression(parser, precedence)
    return PrefixExpression(token.value, operand)


def parse_binary_expression(parser, left, token, *, precedence):
    # The right operand is parsed at the operator's own precedence; together
    # with the strict `<` in parse_expression this makes binary operators
    # left-associative.
    right = parse_expression(parser, precedence)
    return OperatorExpression(left, token.type, right)


def parse_postfix_expression(parser, left, token, *, precedence):
    return PostfixExpression(left, token.type)


def parse_conditional_expression(parser, left, token, *, precedence):
    # Both arms of `cond ? then : else` are parsed at the precedence of '?'.
    then_arm = parse_expression(parser, precedence)
    token = parser.consume()
    assert token.type == ':', token
    else_arm = parse_expression(parser, precedence)
    return ConditionalExpression(left, then_arm, else_arm)


class Expression:
    ...


@dataclass
class ConditionalExpression(Expression):
    condition: Expression
    then_arm: Expression
    else_arm: Expression


@dataclass
class NameExpression(Expression):
    name: str


@dataclass
class PrefixExpression(Expression):
    operator: str
    operand: Expression


@dataclass
class OperatorExpression(Expression):
    left: Expression
    operator: str
    right: Expression


@dataclass
class PostfixExpression(Expression):
    left: Expression
    operator: str


class Parser:
    """Wrap a token iterator with single-token lookahead."""

    def __init__(self, tokens):
        self.tokens = tokens
        self.peeked = None

    def consume(self):
        if self.peeked is None:
            return next(self.tokens)
        token = self.peeked
        self.peeked = None
        return token

    def look_ahead(self):
        if self.peeked is None:
            self.peeked = next(self.tokens)
        return self.peeked


def lex(text):
    i = 0
    while i < len(text):
        char = text[i]
        i += 1
        if char in PUNCTUATORS:
            yield Token(char, char)
        elif char.isidentifier():
            # Scan the rest of the name.
            start = i - 1
            while i < len(text):
                if not text[i].isalnum():
                    break
                i += 1
            name = text[start:i]
            assert name.isidentifier(), name
            yield Token(NAME, name)
        # Whitespace and any other unrecognized characters are skipped.
    yield Token(EOF, "")
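

# Binding powers for the operators: a larger number binds more tightly.
# 'pre' is the precedence shared by all prefix operators; 'call' is not
# used by any parselet in this stripped-down version.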
PRECEDENCES = {
    '=': 1,
    '?': 2,
    '+': 3,
    '-': 3,
    '*': 4,
    '/': 4,
    '^': 5,
    'pre': 6,
    '!': 7,
    'call': 8,
}
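

# add_precedence() adapts a parselet to the dispatch tables below: it fixes
# the `precedence` keyword argument and records the same value as a
# `precedence` attribute on the wrapper, which get_precedence() reads when
# deciding whether to keep looping.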
def add_precedence(parse_expr, op):
    def parse(*args):
        return parse_expr(*args, precedence=value)
    parse.precedence = value = PRECEDENCES[op]
    return parse


NAME = 'NAME'
EOF = 'EOF'
PUNCTUATORS = '(),=+-*/^~!?:'
PREFIX_OPS = list('+-~!')
BINARY_OPS = list('^*/+-=')


def make_prefix_parsers():
    parsers = {}
    parsers[NAME] = parse_name
    for op in PREFIX_OPS:
        parsers[op] = add_precedence(parse_prefix_expression, 'pre')
    return parsers


def make_infix_parsers():
    parsers = {}
    for op in BINARY_OPS:
        parsers[op] = add_precedence(parse_binary_expression, op)
    parsers['!'] = add_precedence(parse_postfix_expression, '!')
    parsers['?'] = add_precedence(parse_conditional_expression, '?')
    return parsers


PREFIX_PARSERS = make_prefix_parsers()
INFIX_PARSERS = make_infix_parsers()


@dataclass
class Token:
    type: str
    value: str


class ParseError(Exception):
    ...


if __name__ == "__main__":
    main()