#!/usr/bin/env python3
# assignment #1: create a tokenizer that can read basic variables, operators, numbers, and parentheses
# assignment #2: set up node structures for parsing

from dataclasses import dataclass
from typing import Optional
import io
import re

@dataclass
class Token:
    """A single lexeme: its kind and the text it matched."""
    kind: str
    value: str

@dataclass
class TokenPattern:
    """A token kind paired with the regex that recognizes it."""
    kind: str
    pattern: str

@dataclass
class Node:
    """A node in the parse tree; children may be Nodes or Tokens."""
    kind: str
    children: list

@dataclass
class NodePattern:
    """A grammar rule: either a union of alternatives or a sequence of parts."""
    kind: str
    sequence_type: str
    sequence: list

class ParseError(Exception):
    """Raised upon failure to parse."""
    pass

token_patterns = [
    TokenPattern('whitespace', r'^\s+$'),
    TokenPattern('left parenthesis', r'^\($'),
    TokenPattern('right parenthesis', r'^\)$'),
    TokenPattern('add', r'^\+$'),
    TokenPattern('subtract', r'^-$'),
    TokenPattern('multiply', r'^\*$'),
    TokenPattern('divide', r'^/$'),
    TokenPattern('exponentiate', r'^\^$'),
    TokenPattern('modulo', r'^%$'),
    TokenPattern('number', r'^[0-9]+$'),
    TokenPattern('variable', r'^[_a-zA-Z][_a-zA-Z0-9]*$'),
]

node_patterns = [
    NodePattern('operator', 'union', ['add', 'subtract', 'multiply', 'divide', 'exponentiate', 'modulo']),
    NodePattern('expression', 'union', ['binary', 'value', 'grouping']),
    NodePattern('value', 'union', ['number', 'variable']),
    NodePattern('binary', 'sequence', ['value', 'operator', 'expression']),
    NodePattern('grouping', 'sequence', ['left parenthesis', 'expression', 'right parenthesis']),
]
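
# an illustrative walkthrough (added note, not part of the original assignment):
# the grammar above derives "5+6" as expression -> binary -> value operator expression.
# union patterns return the matched alternative directly, so only 'sequence'
# rules create Node objects; parse_string("5+6") yields
#   [Node(kind='binary', children=[Token(kind='number', value='5'),
#                                  Token(kind='add', value='+'),
#                                  Token(kind='number', value='6')])]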

token_patterns_dict = {pattern.kind: pattern for pattern in token_patterns}
node_patterns_dict = {pattern.kind: pattern for pattern in node_patterns}

def read_token(stream) -> Optional[Token]:
    """Consume a single token from a stream, or return None if no pattern matches."""
    # iterate all the patterns until finding a match
    for pattern in token_patterns:
        # save the current position
        position = stream.tell()
        # consume until the pattern no longer matches to find the longest valid match
        string = ''
        valid_match = None
        while char := stream.read(1):
            string += char
            match = re.match(pattern.pattern, string)
            if not match:
                break
            else:
                position = stream.tell()
                valid_match = match
        # seek back to just after the last matching character
        # (or back to the start if this pattern never matched)
        stream.seek(position)
        # if a match was found, return it as a token
        if valid_match:
            return Token(pattern.kind, valid_match.string)
    # no pattern matched the input at this position
    return None
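
# a quick demonstration of the longest-match behaviour (added example, not part
# of the original assignment): 'foo42' is consumed as a single variable token
# rather than a variable followed by a number
with io.StringIO('foo42+') as _stream:
    assert read_token(_stream) == Token('variable', 'foo42')
    assert read_token(_stream) == Token('add', '+')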

def tokenize(stream) -> list:
    """Tokenize a stream, returning a list of tokens."""
    # QUESTION: is there a one-line way to do this in python?
    # one option: return list(iter(lambda: read_token(stream), None)),
    # using the two-argument form of iter() that calls until the sentinel
    tokens = []
    while token := read_token(stream):
        tokens.append(token)
    return tokens

def tokenize_string(string: str) -> list:
    """Tokenize a string, returning a list of tokens."""
    with io.StringIO(string) as stream:
        return tokenize(stream)
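
# a sanity check for the tokenizer (added example, not part of the original
# assignment): whitespace is tokenized too, and only skipped later during parsing
assert [t.kind for t in tokenize_string('x + 42')] == [
    'variable', 'whitespace', 'add', 'whitespace', 'number']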

def parse_tokens(tokens: list, kind='expression') -> tuple:
    """Parse a sequence of tokens into a node tree.
    Returns the parsed node and the rest of the token list after tokens have been consumed."""
    # ignore all whitespace tokens
    while tokens and tokens[0].kind == 'whitespace':
        tokens = tokens[1:]
    # make sure we haven't run out of tokens!
    if len(tokens) == 0:
        raise ParseError('Expected token but got nothing')
    # if the requested kind is a token kind, the next token must be of that kind
    # (note that token kinds are returned as Token leaves, not wrapped in a Node)
    if kind in token_patterns_dict:
        if tokens[0].kind == kind:
            return tokens[0], tokens[1:]
        else:
            raise ParseError(f'Expected token kind "{kind}" but got {tokens[0]}')
    # otherwise look up the grammar rule for the requested kind
    pattern = node_patterns_dict[kind]
    if pattern.sequence_type == 'union':
        # try each alternative in order, returning the first that parses
        for kind_ in pattern.sequence:
            try:
                return parse_tokens(tokens, kind_)
            except ParseError:
                pass
        s = ', '.join(pattern.sequence)
        raise ParseError(f'Expected one of the node kinds "{s}" but got {tokens[0]}')
    elif pattern.sequence_type == 'sequence':
        # parse each part of the sequence in turn, consuming tokens as we go
        children = []
        for kind_ in pattern.sequence:
            node, tokens = parse_tokens(tokens, kind_)
            children.append(node)
        return Node(kind, children), tokens
    else:
        raise ParseError(f'Invalid sequence type "{pattern.sequence_type}"')
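
# a direct use of parse_tokens (added example, not part of the original
# assignment): parsing a 'value' consumes one token and returns the remainder
_node, _rest = parse_tokens(tokenize_string('7 x'), 'value')
assert _node == Token('number', '7')
assert [t.kind for t in _rest] == ['whitespace', 'variable']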

def parse_all_tokens(tokens: list, kind='expression') -> list:
    """Parse all of the given tokens into a list of nodes."""
    nodes = []
    # stop once only whitespace remains; otherwise trailing whitespace would
    # trigger a spurious "expected token" error
    while any(token.kind != 'whitespace' for token in tokens):
        node, tokens = parse_tokens(tokens, kind)
        nodes.append(node)
    return nodes

def parse_string(string: str) -> list:
    """Parse a string into a list of node trees."""
    return parse_all_tokens(tokenize_string(string))

# test it out
print(parse_string("5 + (5-6)"))
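
# an added demonstration of a current grammar limitation (not part of the
# original assignment): 'binary' requires its left operand to be a plain
# value, so an expression with a grouping on the left fails to parse
try:
    parse_string("(5-6)+5")
except ParseError as error:
    print('grammar limitation:', error)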