@MegaLoler
Created September 12, 2020 14:00
assignment: create a tokenizer that can read basic variables, operators, numbers, and parentheses
#!/usr/bin/env python3
# create a tokenizer that can read basic variables, operators, numbers, and parentheses
from dataclasses import dataclass
import io
import re


@dataclass
class Token:
    kind: str
    value: str


@dataclass
class Pattern:
    pattern: str
    kind: str


patterns = [
    Pattern(r'^\s+$', 'whitespace'),
    Pattern(r'^\($', 'left_parenthesis'),
    Pattern(r'^\)$', 'right_parenthesis'),
    Pattern(r'^\+$', 'add'),
    Pattern(r'^\-$', 'subtract'),
    Pattern(r'^\*$', 'multiply'),
    Pattern(r'^\/$', 'divide'),
    Pattern(r'^\^$', 'exponentiate'),
    Pattern(r'^\%$', 'modulo'),
    Pattern(r'^[0-9]+$', 'number'),
    Pattern(r'^[_a-zA-Z][_a-zA-Z0-9]*$', 'variable'),
]
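
# note: each pattern is anchored with ^ and $, so it must match the entire
# candidate string; patterns are tried in list order, and the first kind
# that yields a valid match wins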


def read_token(stream) -> Token:
    """Consume a single token from a stream."""
    # try each pattern in order until one produces a match
    for pattern in patterns:
        # save the current position so we can rewind on failure
        position = stream.tell()
        # consume characters until the pattern no longer matches,
        # tracking the longest valid match seen so far
        string = ''
        valid_match = None
        while char := stream.read(1):
            string += char
            match = re.match(pattern.pattern, string)
            if not match:
                break
            else:
                position = stream.tell()
                valid_match = match
        # seek back to the end of the last valid match
        # (or to the starting position if nothing matched)
        stream.seek(position)
        # if a match was found, return it as a token
        if valid_match:
            return Token(pattern.kind, valid_match.string)
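
# for example, read_token(io.StringIO('foo + 1')) would return
# Token(kind='variable', value='foo') and leave the stream positioned
# just after 'foo'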


def tokenize(stream) -> list:
    """Tokenize a string from a stream, returning a list of tokens."""
    # QUESTION: is there a one-line way to do this in python?
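    # one possible one-liner, as a sketch: iter() with a None sentinel keeps
    # calling read_token until it returns None:
    #     return list(iter(lambda: read_token(stream), None))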
    tokens = []
    while token := read_token(stream):
        tokens.append(token)
    return tokens


def tokenize_string(string: str) -> list:
    """Tokenize a string, returning a list of tokens."""
    with io.StringIO(string) as stream:
        return tokenize(stream)


# test it out
print(tokenize_string("58 - _hey +(6*7%018327)^1"))
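
# this should print something like:
# [Token(kind='number', value='58'), Token(kind='whitespace', value=' '),
#  Token(kind='subtract', value='-'), Token(kind='whitespace', value=' '),
#  Token(kind='variable', value='_hey'), Token(kind='whitespace', value=' '),
#  Token(kind='add', value='+'), Token(kind='left_parenthesis', value='('),
#  Token(kind='number', value='6'), Token(kind='multiply', value='*'),
#  Token(kind='number', value='7'), Token(kind='modulo', value='%'),
#  Token(kind='number', value='018327'), Token(kind='right_parenthesis', value=')'),
#  Token(kind='exponentiate', value='^'), Token(kind='number', value='1')]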