Last active
September 17, 2025 19:30
-
-
Save jorektheglitch/20ae5e581fc671612b4da80f57a6fae5 to your computer and use it in GitHub Desktop.
Simple tokenizer for (extended) math expressions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from abc import ABC | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from string import ascii_letters, digits | |
| from typing import List, Optional, Union | |
# --- Token category enums and character -> token lookup tables -------------

DELIMITER = Enum("DELIMITER", "COMMA SEMICOLON")

OP_TOKEN = Enum(
    "OP_TOKEN",
    "EQUALS "
    # =
    "PLUS HYPHEN ASTERISK SLASH CARET "
    # + - * / ^
    "UNARY_PLUS UNARY_HYPHEN",
)

PAREN = Enum("PAREN", "OPEN CLOSE")

# Characters that terminate the Number/Variable currently being buffered.
DELIMITER_CHARS = (" ", ",", ";", "\n")

# Delimiter characters that also produce a Delimiter token (space/newline
# only flush the buffer and are discarded).
DELIMITERS = {",": DELIMITER.COMMA, ";": DELIMITER.SEMICOLON}

PARENS = {"(": PAREN.OPEN, ")": PAREN.CLOSE}

OPERATIONS = dict(zip(
    "=+-*/^",
    (OP_TOKEN.EQUALS, OP_TOKEN.PLUS, OP_TOKEN.HYPHEN,
     OP_TOKEN.ASTERISK, OP_TOKEN.SLASH, OP_TOKEN.CARET),
))

# Binary '+'/'-' are rewritten to these members when found in prefix position.
UNARIES = {
    OP_TOKEN.PLUS: OP_TOKEN.UNARY_PLUS,
    OP_TOKEN.HYPHEN: OP_TOKEN.UNARY_HYPHEN,
}
class TokenizationError(Exception):
    """Raised when the input cannot be tokenized.

    Optionally carries the 0-based source coordinates of the offending
    character; when present they are prefixed to the message by __str__.
    """

    line: Optional[int]
    position: Optional[int]

    def __init__(self, msg=None, line=None, position=None):
        super().__init__(*((msg,) if msg else ()))
        self.line = line
        self.position = position

    def __str__(self):
        base = super().__str__()
        parts = []
        if self.line is not None:
            parts.append(f"line {self.line}")
        if self.position is not None:
            parts.append(f"position {self.position}")
        if not parts:
            return base
        return f"({' '.join(parts)}) {base}"
@dataclass
class Token(ABC):
    """Abstract base for all lexer tokens; records source coordinates."""
    line: int      # 0-based line of the character that produced the token
    position: int  # 0-based column on that line
@dataclass
class Operation(Token):
    """An operator token; `op` may be rewritten to a UNARY_* member by tokenize."""
    op: OP_TOKEN
@dataclass
class Parenthesis(Token):
    """An opening or closing parenthesis."""
    value: PAREN  # PAREN.OPEN for '(' , PAREN.CLOSE for ')'
@dataclass
class Delimiter(Token):
    """A ',' or ';' separator token."""
    value: DELIMITER
@dataclass
class Number(Token):
    """A numeric literal: int when no '.' appeared, float otherwise."""
    value: Union[int, float]
@dataclass
class Variable(Token):
    """A name token (ASCII letters, possibly followed by digits)."""
    name: str
def augmented(string: str):
    """Yield (line, column, char) for every character, 0-based coordinates.

    A newline is reported at its own coordinates first; the following
    character starts the next line at column 0.
    """
    line, column = 0, 0
    for ch in string:
        yield line, column, ch
        if ch == "\n":
            line, column = line + 1, 0
        else:
            column += 1
def tokenize(string: str):
    """Split *string* into a flat list of Token instances.

    Recognizes numbers (int/float), variable names, operators, parentheses
    and ','/';' delimiters.  '+'/'-' in prefix position (start of input,
    after another operator, after a delimiter, or after '(') become
    UNARY_PLUS/UNARY_HYPHEN.

    Raises TokenizationError (with 0-based line/position) on unknown
    characters, misplaced '.', malformed numbers, or a non-unary operator
    in prefix position.

    Fixes vs. earlier revision:
      * a trailing token left in the buffer at end of input is flushed
        instead of silently dropped (e.g. "a = 1" now yields the 1);
      * a Delimiter token is appended AFTER the buffered token it follows,
        preserving source order;
      * float()/int() failures are reported as TokenizationError, not a
        bare ValueError;
      * '+'/'-' right after '(' or a delimiter is treated as unary.
    """
    tokens: List[Token] = []
    buffer = ""
    append_buffer: bool = False
    delimiter: Optional[DELIMITER] = None
    token: Optional[Token] = None
    last_token: Optional[Token] = None

    def _flush_buffer(line: int, position: int) -> None:
        # Convert the accumulated characters into a Number or Variable.
        # `position` is one past the buffer's last character.
        nonlocal buffer, last_token
        if not buffer:
            return
        pos = position - len(buffer)
        parsed: Token
        try:
            if "." in buffer:
                parsed = Number(line, pos, float(buffer))
            elif buffer[0] in digits:
                # Starts with a digit: must be an integer literal.
                parsed = Number(line, pos, int(buffer))
            else:
                # Buffer can only contain letters/digits/'.', so anything
                # else is a name.
                parsed = Variable(line, pos, buffer)
        except ValueError:
            raise TokenizationError(
                f"Malformed number {buffer!r}", line, pos
            ) from None
        tokens.append(parsed)
        last_token = parsed
        buffer = ""

    line = position = -1  # retained after the loop for the final flush
    for line, position, char in augmented(string):
        if char in DELIMITER_CHARS:
            delimiter = DELIMITERS.get(char)  # None for space/newline
            append_buffer = True
        elif char in digits or char in ascii_letters:
            buffer += char
        elif char == ".":
            if buffer:
                if buffer[0] in ascii_letters:
                    raise TokenizationError(
                        "Unexpected '.' in name",
                        line, position
                    )
                elif "." in buffer:
                    raise TokenizationError(
                        "Doubled '.' in digit",
                        line, position
                    )
            buffer += char
        elif char in OPERATIONS:
            append_buffer = True
            token = Operation(line, position, OPERATIONS[char])
        elif char in PARENS:
            append_buffer = True
            token = Parenthesis(line, position, PARENS[char])
        else:
            raise TokenizationError(
                f"Unknown {repr(char)} symbol.",
                line, position
            )
        if append_buffer:
            # Flush the pending Number/Variable BEFORE the delimiter so
            # tokens appear in source order.
            _flush_buffer(line, position)
            if delimiter is not None:
                delim_token = Delimiter(line, position, delimiter)
                tokens.append(delim_token)
                last_token = delim_token
                delimiter = None
            append_buffer = False
        if isinstance(token, Operation):
            prefix_position = (
                last_token is None
                or isinstance(last_token, (Operation, Delimiter))
                or (isinstance(last_token, Parenthesis)
                    and last_token.value is PAREN.OPEN)
            )
            if prefix_position:
                if token.op not in UNARIES:
                    raise TokenizationError(
                        f"Unexpected {repr(char)} operator",
                        token.line, token.position
                    )
                token.op = UNARIES[token.op]
        if token is not None:
            tokens.append(token)
            last_token = token
            token = None
    # End of input: flush whatever is still buffered (previously lost when
    # the input did not end with a delimiter character).
    _flush_buffer(line, position + 1)
    return tokens
# Demo: tokenize a small multi-line sample and dump the token stream.
test_str = """
a = 222+.2*-2.2
b(f1, f2) = f1()+f2
b(() = 0, 0)
c = -+-+--++2
"""

print("Test string:", test_str, "\n")
print("Tokens:")
for _token in tokenize(test_str):
    print(_token)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment