Skip to content

Instantly share code, notes, and snippets.

@jorektheglitch
Last active September 17, 2025 19:30
Show Gist options
  • Save jorektheglitch/20ae5e581fc671612b4da80f57a6fae5 to your computer and use it in GitHub Desktop.
Save jorektheglitch/20ae5e581fc671612b4da80f57a6fae5 to your computer and use it in GitHub Desktop.
Simple tokenizer for (extended) math expressions
from abc import ABC
from dataclasses import dataclass
from enum import Enum
from string import ascii_letters, digits
from typing import List, Optional, Union
# Separators that are emitted as tokens of their own.
DELIMITER = Enum("DELIMITER", ["COMMA", "SEMICOLON"])
DELIMITERS = {",": DELIMITER.COMMA, ";": DELIMITER.SEMICOLON}

# Every character that ends the token currently being read.  Space and
# newline have no entry in DELIMITERS, so they produce no token themselves.
DELIMITER_CHARS = (" ", ",", ";", "\n")

# Operator kinds.  The UNARY_* members have no character of their own; they
# are reached only through the UNARIES mapping below.
OP_TOKEN = Enum(
    "OP_TOKEN",
    [
        "EQUALS",        # =
        "PLUS",          # +
        "HYPHEN",        # -
        "ASTERISK",      # *
        "SLASH",         # /
        "CARET",         # ^
        "UNARY_PLUS",
        "UNARY_HYPHEN",
    ],
)

PAREN = Enum("PAREN", ["OPEN", "CLOSE"])
PARENS = {"(": PAREN.OPEN, ")": PAREN.CLOSE}

# Source character -> operator kind.
OPERATIONS = {
    symbol: OP_TOKEN[name]
    for symbol, name in (
        ("=", "EQUALS"),
        ("+", "PLUS"),
        ("-", "HYPHEN"),
        ("*", "ASTERISK"),
        ("/", "SLASH"),
        ("^", "CARET"),
    )
}

# Binary operator kind -> its unary counterpart.
UNARIES = {
    OP_TOKEN[name]: OP_TOKEN["UNARY_" + name]
    for name in ("PLUS", "HYPHEN")
}
class TokenizationError(Exception):
    """Error raised for input that cannot be tokenized.

    Optionally carries the ``line`` and ``position`` the error refers to;
    whichever of the two is set is prepended to the message by ``str()``.
    """

    line: Optional[int]
    position: Optional[int]

    def __init__(self, msg=None, line=None, position=None):
        # A falsy message (None or "") is not stored, so ``args`` stays empty.
        if msg:
            super().__init__(msg)
        else:
            super().__init__()
        self.line = line
        self.position = position

    def __str__(self):
        text = super().__str__()
        location_parts = []
        if self.line is not None:
            location_parts.append(f"line {self.line}")
        if self.position is not None:
            location_parts.append(f"position {self.position}")
        if not location_parts:
            return text
        location = " ".join(location_parts)
        return f"({location}) {text}"
@dataclass
class Token(ABC):
    """Common base of all tokens: records the token's place in the input."""

    # Line index of the token (augmented() counts lines from 0).
    line: int
    # Character index of the token within its line (also counted from 0).
    position: int
@dataclass
class Operation(Token):
    """Token for an operator character such as ``+`` or ``=``."""

    # Operator kind; tokenize() rewrites it to a UNARY_* member when the
    # operator appears in a unary position.
    op: OP_TOKEN
@dataclass
class Parenthesis(Token):
    """Token for a single ``(`` or ``)`` character."""

    # PAREN.OPEN for "(" and PAREN.CLOSE for ")".
    value: PAREN
@dataclass
class Delimiter(Token):
    """Token for an explicit separator character (``,`` or ``;``)."""

    # Which separator was read.
    value: DELIMITER
@dataclass
class Number(Token):
    """Token for a numeric literal."""

    # Parsed value: float when the literal contained a ".", int otherwise.
    value: Union[int, float]
@dataclass
class Variable(Token):
    """Token for an identifier (a run of characters starting with a letter)."""

    # The identifier text exactly as it appeared in the input.
    name: str
def augmented(string: str):
    """Yield ``(line, position, char)`` for every character of *string*.

    Both counters start at 0.  A newline is reported at the end of the line
    it terminates; the character after it starts the next line at position 0.
    """
    row, column = 0, 0
    for symbol in string:
        yield row, column, symbol
        # Advance the coordinates only after the character was reported.
        row, column = (row + 1, 0) if symbol == "\n" else (row, column + 1)
def _parse_buffer(buffer: str, line: int, start: int) -> "Token":
    """Turn the accumulated characters of one word into a Number or Variable."""
    if buffer[0] in ascii_letters:
        return Variable(line, start, buffer)
    try:
        value: Union[int, float] = (
            float(buffer) if "." in buffer else int(buffer)
        )
    except ValueError:
        # e.g. ".", "2a" or "2e5": report it as a tokenizer error with
        # coordinates instead of leaking a bare ValueError to the caller.
        raise TokenizationError(
            f"Malformed number {buffer!r}", line, start
        ) from None
    return Number(line, start, value)


def tokenize(string: str) -> "List[Token]":
    """Split a math-expression string into a flat list of tokens.

    Letters, digits and "." are accumulated into a word that becomes a
    ``Variable`` (starts with a letter) or a ``Number`` (``float`` if it
    contains a ".", ``int`` otherwise).  A word ends at a space, newline,
    ",", ";", operator or parenthesis, or at the end of the input.
    "," and ";" yield ``Delimiter`` tokens; spaces and newlines yield nothing.
    "+" and "-" become ``UNARY_PLUS`` / ``UNARY_HYPHEN`` when they are the
    first token or directly follow another operator.

    Args:
        string: The text to tokenize.

    Returns:
        The tokens in input order, each carrying the 0-based line and
        position at which it starts.

    Raises:
        TokenizationError: On an unknown character, a "." inside a name, a
            second "." in a number, a malformed number, or an operator other
            than "+" / "-" in a unary position.
    """
    tokens: "List[Token]" = []
    buffer = ""
    # Coordinates of the first character of the word being accumulated.
    buffer_line = 0
    buffer_start = 0
    # Last Number/Variable/Operation/Parenthesis emitted.  Delimiters are
    # deliberately not recorded here (same as the previous behaviour).
    # NOTE(review): consequently "+"/"-" right after "(" or "," is reported
    # as a binary operator - confirm whether the parser relies on that.
    last_token: "Optional[Token]" = None

    for line, position, char in augmented(string):
        if char in digits or char in ascii_letters or char == ".":
            if char == "." and buffer:
                if buffer[0] in ascii_letters:
                    raise TokenizationError(
                        "Unexpected '.' in name",
                        line, position
                    )
                if "." in buffer:
                    raise TokenizationError(
                        "Doubled '.' in digit",
                        line, position
                    )
            if not buffer:
                buffer_line, buffer_start = line, position
            buffer += char
            continue

        # Any other accepted character terminates the pending word.
        delimiter = None
        token: "Optional[Token]" = None
        if char in DELIMITER_CHARS:
            delimiter = DELIMITERS.get(char)  # None for space / newline
        elif char in OPERATIONS:
            token = Operation(line, position, OPERATIONS[char])
        elif char in PARENS:
            token = Parenthesis(line, position, PARENS[char])
        else:
            raise TokenizationError(
                f"Unknown {repr(char)} symbol.",
                line, position
            )

        # Emit the pending word first so tokens stay in input order (the
        # delimiter used to be appended ahead of the word preceding it).
        if buffer:
            last_token = _parse_buffer(buffer, buffer_line, buffer_start)
            tokens.append(last_token)
            buffer = ""

        if delimiter is not None:
            tokens.append(Delimiter(line, position, delimiter))
            continue
        if token is None:
            continue  # plain whitespace

        if isinstance(token, Operation) and (
            last_token is None or isinstance(last_token, Operation)
        ):
            # Operator in operand position: only "+" and "-" may be unary.
            if token.op not in UNARIES:
                raise TokenizationError(
                    f"Unexpected {repr(char)} operator",
                    token.line, token.position
                )
            token.op = UNARIES[token.op]
        tokens.append(token)
        last_token = token

    # Input that does not end in a delimiter still has a pending word.
    if buffer:
        tokens.append(_parse_buffer(buffer, buffer_line, buffer_start))
    return tokens
# Demo input: one expression per line, with a leading and a trailing newline.
test_str = "\n".join(
    [
        "",
        "a = 222+.2*-2.2",
        "b(f1, f2) = f1()+f2",
        "b(() = 0, 0)",
        "c = -+-+--++2",
        "",
    ]
)

# Echo the input, then print one token per line.
print("Test string:", test_str, "\n")
print("Tokens:")
print("\n".join(map(str, tokenize(test_str))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment