Skip to content

Instantly share code, notes, and snippets.

@MegaLoler
Created September 18, 2020 12:46
Show Gist options
  • Save MegaLoler/7d1aa5a68895bd4c0ab9611b6dbe9f56 to your computer and use it in GitHub Desktop.
Save MegaLoler/7d1aa5a68895bd4c0ab9611b6dbe9f56 to your computer and use it in GitHub Desktop.
assignment #2: set up node structures for parsing
#!/bin/env python3
# assignment #1: create a tokenizer that can read basic variables, operators, numbers, and parenthesis
# assignment #2: set up node structures for parsing
from dataclasses import dataclass
import io
import re
@dataclass
class Token:
kind: str
value: str
@dataclass
class TokenPattern:
kind: str
pattern: str
@dataclass
class Node:
kind: str
children: list
@dataclass
class NodePattern:
kind: str
sequence_type: str
sequence: list
class ParseError(Exception):
"""Raised upon failure to parse."""
pass
token_patterns = [
TokenPattern('whitespace', r'^\s+$'),
TokenPattern('left parenthesis', r'^\($'),
TokenPattern('right parenthesis', r'^\)$'),
TokenPattern('add', r'^\+$'),
TokenPattern('subtract', r'^\-$'),
TokenPattern('multiply', r'^\*$'),
TokenPattern('divide', r'^\/$'),
TokenPattern('exponentiate', r'^\^$'),
TokenPattern('modulo', r'^\%$'),
TokenPattern('number', r'^[0-9]+$'),
TokenPattern('variable', r'^[_a-zA-Z][_a-zA-Z0-9]*$'),
]
node_patterns = [
NodePattern('operator', 'union', ['add', 'subtract', 'multiply', 'divide', 'exponentiate', 'modulo']),
NodePattern('expression', 'union', ['binary', 'value', 'grouping']),
NodePattern('value', 'union', ['number', 'variable']),
NodePattern('binary', 'sequence', ['value', 'operator', 'expression']),
NodePattern('grouping', 'sequence', ['left parenthesis', 'expression', 'right parenthesis']),
]
token_patterns_dict = {token.kind: token for token in token_patterns}
node_patterns_dict = {pattern.kind: pattern for pattern in node_patterns}
def read_token(stream) -> Token:
"""Consume a single token from a string."""
# iterate all the patterns until finding a match
for pattern in token_patterns:
# save the current position
position = stream.tell()
# consume until the pattern is not matched to find longest valid match
string = ''
valid_match = None
while char := stream.read(1):
string += char
match = re.match(pattern.pattern, string)
if not match:
break
else:
position = stream.tell()
valid_match = match
# seek back to the starting position
stream.seek(position)
# if found a match, return that as a token
if valid_match:
return Token(pattern.kind, valid_match.string)
def tokenize(stream) -> list:
"""Tokenize a string from a stream, returning a list of tokens."""
# QUESTION: is there a one-line way to do this in python?
tokens = []
while token := read_token(stream):
tokens.append(token)
return tokens
def tokenize_string(string: str) -> list:
"""Tokenize a string, returning aa list of tokens."""
with io.StringIO(string) as stream:
return tokenize(stream)
def parse_tokens(tokens: list, kind='expression') -> (Node, list):
"""Parse a sequence of tokens into a node tree. Returns the parsed node and the rest of the token list after tokens have been consumed."""
# ignore all whitespace tokens
while tokens and tokens[0].kind == 'whitespace':
tokens = tokens[1:]
# make sure we haven't run out of tokens!
if len(tokens) == 0:
raise ParseError('Expected token but got nothing')
# if the requested kind is a token kind then the next token must be of that kind
if kind in token_patterns_dict:
if tokens[0].kind == kind:
return tokens[0], tokens[1:]
else:
raise ParseError(f'Expected token kind "{kind}" but got {tokens[0]}')
# get the pattern to be parsed from the tokens
pattern = node_patterns_dict[kind]
if pattern.sequence_type == 'union':
for kind in pattern.sequence:
try:
return parse_tokens(tokens, kind)
except ParseError:
pass
s = ', '.join(pattern.sequence)
raise ParseError(f'Expected node kind of "{s}" but got {tokens[0]}')
elif pattern.sequence_type == 'sequence':
children = []
for kind_ in pattern.sequence:
node, tokens = parse_tokens(tokens, kind_)
children.append(node)
return Node(kind, children), tokens
else:
raise ParseError(f'Invalid sequence type')
def parse_all_tokens(tokens: list, kind='expression') -> list:
"""Parses all of the given tokens into a list of nodes."""
nodes = []
while tokens:
node, tokens = parse_tokens(tokens, kind)
nodes.append(node)
return nodes
def parse_string(string: str) -> Node:
"""Parse a string into a node tree."""
return parse_all_tokens(tokenize_string(string))
# test it out
print(parse_string("5 + (5-6)"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment