#!/usr/bin/env python3
# assignment #1: create a tokenizer that can read basic variables, operators, numbers, and parentheses
# assignment #2: set up node structures for parsing

from dataclasses import dataclass
from typing import Optional
import io
import re

@dataclass
class Token:
    """A single lexeme: its kind and the text it matched."""
    kind: str
    value: str

@dataclass
class TokenPattern:
    """A token kind paired with the regex that recognizes it."""
    kind: str
    pattern: str

@dataclass
class Node:
    """A node in the parse tree; children may be Nodes or Tokens."""
    kind: str
    children: list

@dataclass
class NodePattern:
    """A grammar rule: either a union of alternatives or a sequence of parts."""
    kind: str
    sequence_type: str
    sequence: list

class ParseError(Exception):
    """Raised upon failure to parse."""
    pass

token_patterns = [
    TokenPattern('whitespace', r'^\s+$'),
    TokenPattern('left parenthesis', r'^\($'),
    TokenPattern('right parenthesis', r'^\)$'),
    TokenPattern('add', r'^\+$'),
    TokenPattern('subtract', r'^-$'),
    TokenPattern('multiply', r'^\*$'),
    TokenPattern('divide', r'^/$'),
    TokenPattern('exponentiate', r'^\^$'),
    TokenPattern('modulo', r'^%$'),
    TokenPattern('number', r'^[0-9]+$'),
    TokenPattern('variable', r'^[_a-zA-Z][_a-zA-Z0-9]*$'),
]

node_patterns = [
    NodePattern('operator', 'union', ['add', 'subtract', 'multiply', 'divide', 'exponentiate', 'modulo']),
    NodePattern('expression', 'union', ['binary', 'value', 'grouping']),
    NodePattern('value', 'union', ['number', 'variable']),
    NodePattern('binary', 'sequence', ['value', 'operator', 'expression']),
    NodePattern('grouping', 'sequence', ['left parenthesis', 'expression', 'right parenthesis']),
]
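
# an illustrative walkthrough (added note, not part of the original assignment):
# the grammar above derives "5+6" as expression -> binary -> value operator expression.
# union patterns return the matched alternative directly, so only 'sequence'
# rules create Node objects; parse_string("5+6") yields
#   [Node(kind='binary', children=[Token(kind='number', value='5'),
#                                  Token(kind='add', value='+'),
#                                  Token(kind='number', value='6')])]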

token_patterns_dict = {pattern.kind: pattern for pattern in token_patterns}
node_patterns_dict = {pattern.kind: pattern for pattern in node_patterns}

def read_token(stream) -> Optional[Token]:
    """Consume a single token from a stream, or return None if no pattern matches."""
    # iterate all the patterns until finding a match
    for pattern in token_patterns:
        # save the current position
        position = stream.tell()
        # consume until the pattern no longer matches to find the longest valid match
        string = ''
        valid_match = None
        while char := stream.read(1):
            string += char
            match = re.match(pattern.pattern, string)
            if not match:
                break
            else:
                position = stream.tell()
                valid_match = match
        # seek back to just after the last matching character
        # (or back to the start if this pattern never matched)
        stream.seek(position)
        # if a match was found, return it as a token
        if valid_match:
            return Token(pattern.kind, valid_match.string)
    # no pattern matched the input at this position
    return None
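
# a quick demonstration of the longest-match behaviour (added example, not part
# of the original assignment): 'foo42' is consumed as a single variable token
# rather than a variable followed by a number
with io.StringIO('foo42+') as _stream:
    assert read_token(_stream) == Token('variable', 'foo42')
    assert read_token(_stream) == Token('add', '+')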

def tokenize(stream) -> list:
    """Tokenize a stream, returning a list of tokens."""
    # QUESTION: is there a one-line way to do this in python?
    # one option: return list(iter(lambda: read_token(stream), None)),
    # using the two-argument form of iter() that calls until the sentinel
    tokens = []
    while token := read_token(stream):
        tokens.append(token)
    return tokens

def tokenize_string(string: str) -> list:
    """Tokenize a string, returning a list of tokens."""
    with io.StringIO(string) as stream:
        return tokenize(stream)
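
# a sanity check for the tokenizer (added example, not part of the original
# assignment): whitespace is tokenized too, and only skipped later during parsing
assert [t.kind for t in tokenize_string('x + 42')] == [
    'variable', 'whitespace', 'add', 'whitespace', 'number']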

def parse_tokens(tokens: list, kind='expression') -> tuple:
    """Parse a sequence of tokens into a node tree.
    Returns the parsed node and the rest of the token list after tokens have been consumed."""
    # ignore all whitespace tokens
    while tokens and tokens[0].kind == 'whitespace':
        tokens = tokens[1:]
    # make sure we haven't run out of tokens!
    if len(tokens) == 0:
        raise ParseError('Expected token but got nothing')
    # if the requested kind is a token kind, the next token must be of that kind
    # (note that token kinds are returned as Token leaves, not wrapped in a Node)
    if kind in token_patterns_dict:
        if tokens[0].kind == kind:
            return tokens[0], tokens[1:]
        else:
            raise ParseError(f'Expected token kind "{kind}" but got {tokens[0]}')
    # otherwise look up the grammar rule for the requested kind
    pattern = node_patterns_dict[kind]
    if pattern.sequence_type == 'union':
        # try each alternative in order, returning the first that parses
        for kind_ in pattern.sequence:
            try:
                return parse_tokens(tokens, kind_)
            except ParseError:
                pass
        s = ', '.join(pattern.sequence)
        raise ParseError(f'Expected one of the node kinds "{s}" but got {tokens[0]}')
    elif pattern.sequence_type == 'sequence':
        # parse each part of the sequence in turn, consuming tokens as we go
        children = []
        for kind_ in pattern.sequence:
            node, tokens = parse_tokens(tokens, kind_)
            children.append(node)
        return Node(kind, children), tokens
    else:
        raise ParseError(f'Invalid sequence type "{pattern.sequence_type}"')
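
# a direct use of parse_tokens (added example, not part of the original
# assignment): parsing a 'value' consumes one token and returns the remainder
_node, _rest = parse_tokens(tokenize_string('7 x'), 'value')
assert _node == Token('number', '7')
assert [t.kind for t in _rest] == ['whitespace', 'variable']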

def parse_all_tokens(tokens: list, kind='expression') -> list:
    """Parse all of the given tokens into a list of nodes."""
    nodes = []
    # stop once only whitespace remains; otherwise trailing whitespace would
    # trigger a spurious "expected token" error
    while any(token.kind != 'whitespace' for token in tokens):
        node, tokens = parse_tokens(tokens, kind)
        nodes.append(node)
    return nodes

def parse_string(string: str) -> list:
    """Parse a string into a list of node trees."""
    return parse_all_tokens(tokenize_string(string))

# test it out
print(parse_string("5 + (5-6)"))
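
# an added demonstration of a current grammar limitation (not part of the
# original assignment): 'binary' requires its left operand to be a plain
# value, so an expression with a grouping on the left fails to parse
try:
    parse_string("(5-6)+5")
except ParseError as error:
    print('grammar limitation:', error)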