Created September 12, 2020 14:00
assignment: create a tokenizer that can read basic variables, operators, numbers, and parentheses
#!/usr/bin/env python3
# create a tokenizer that can read basic variables, operators, numbers, and parentheses
from dataclasses import dataclass
from typing import Optional
import io
import re
@dataclass
class Token:
    """A lexed token: the kind of token and the text it was read from."""
    kind: str
    value: str

@dataclass
class Pattern:
    """An anchored regular expression and the kind of token it produces."""
    pattern: str
    kind: str
patterns = [
    Pattern(r'^\s+$', 'whitespace'),
    Pattern(r'^\($', 'left_parenthesis'),
    Pattern(r'^\)$', 'right_parenthesis'),
    Pattern(r'^\+$', 'add'),
    Pattern(r'^\-$', 'subtract'),
    Pattern(r'^\*$', 'multiply'),
    Pattern(r'^\/$', 'divide'),
    Pattern(r'^\^$', 'exponentiate'),
    Pattern(r'^\%$', 'modulo'),
    Pattern(r'^[0-9]+$', 'number'),
    Pattern(r'^[_a-zA-Z][_a-zA-Z0-9]*$', 'variable'),
]
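# new token kinds can be added by appending to this table; e.g. a
# hypothetical assignment operator (not part of the original assignment):
# patterns.append(Pattern(r'^=$', 'assign'))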
def read_token(stream) -> Optional[Token]:
    """Consume a single token from the stream, returning None at end of
    input (or when no pattern matches the next character)."""
    # try each pattern in turn until one yields a match
    for pattern in patterns:
        # save the current position so we can rewind on failure
        position = stream.tell()
        # consume characters until the pattern stops matching,
        # remembering the longest valid match seen so far
        string = ''
        valid_match = None
        while char := stream.read(1):
            string += char
            match = re.match(pattern.pattern, string)
            if not match:
                break
            position = stream.tell()
            valid_match = match
        # rewind to just after the last matching character
        # (or back to the start if nothing matched)
        stream.seek(position)
        # if a match was found, return it as a token
        if valid_match:
            return Token(pattern.kind, valid_match.string)
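# NOTE (added observation): matching gives up on the first prefix that fails,
# so a token is only found if every prefix of it matches its pattern; that
# holds for all the single-character and "one or more" patterns above, but a
# hypothetical multi-character operator like '==' would never be recognized,
# since '=' alone fails the pattern and the loop breaks before reading more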
def tokenize(stream) -> list:
    """Tokenize a stream, returning a list of tokens."""
    # QUESTION: is there a one-line way to do this in python?
    tokens = []
    while token := read_token(stream):
        tokens.append(token)
    return tokens
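# ANSWER (added, one possibility): the two-argument form of iter() calls a
# function repeatedly until it returns the given sentinel, so the loop above
# collapses to a single expression; a sketch, not part of the assignment:
def tokenize_oneliner(stream) -> list:
    """Equivalent one-line tokenizer, answering the QUESTION above."""
    return list(iter(lambda: read_token(stream), None))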
def tokenize_string(string: str) -> list:
    """Tokenize a string, returning a list of tokens."""
    with io.StringIO(string) as stream:
        return tokenize(stream)
# test it out
print(tokenize_string("58 - _hey +(6*7%018327)^1"))
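# whitespace comes back as ordinary tokens; a caller that only wants the
# meaningful ones can filter by kind (illustrative extra check, not part of
# the original assignment):
print([token for token in tokenize_string("x + 42") if token.kind != 'whitespace'])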