Simple tokenizer for (extended) math expressions — single-file Python gist.
| from abc import ABC | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from string import ascii_letters, digits | |
| from typing import List, Optional, Union | |
# Token-category enums and the character -> enum lookup tables used by the
# tokenizer below.

# Separators that produce Delimiter tokens.
DELIMITER = Enum("DELIMITER", "COMMA SEMICOLON")
DELIMITERS = {
    ",": DELIMITER.COMMA,
    ";": DELIMITER.SEMICOLON
}
# Characters that terminate (flush) the current name/number buffer; space and
# newline flush the buffer without emitting a Delimiter token (they are not
# keys of DELIMITERS).
DELIMITER_CHARS = (" ", ",", ";", "\n")
# Operator kinds.  The UNARY_* members are never mapped from a character
# directly: the tokenizer promotes PLUS/HYPHEN to them via UNARIES when the
# operator has no preceding operand.
OP_TOKEN = Enum(
    "OP_TOKEN",
    "EQUALS "
    # =
    "PLUS HYPHEN ASTERISK SLASH CARET "
    # + - * / ^
    "UNARY_PLUS UNARY_HYPHEN",
)
PAREN = Enum("PAREN", "OPEN CLOSE")
PARENS = {
    "(": PAREN.OPEN,
    ")": PAREN.CLOSE
}
# Operator character -> OP_TOKEN member.
OPERATIONS = {
    "=": OP_TOKEN.EQUALS,
    "+": OP_TOKEN.PLUS,
    "-": OP_TOKEN.HYPHEN,
    "*": OP_TOKEN.ASTERISK,
    "/": OP_TOKEN.SLASH,
    "^": OP_TOKEN.CARET,
}
# Binary +/- -> their unary counterparts.
UNARIES = {
    OP_TOKEN.PLUS: OP_TOKEN.UNARY_PLUS,
    OP_TOKEN.HYPHEN: OP_TOKEN.UNARY_HYPHEN
}
class TokenizationError(Exception):
    """Raised when the tokenizer meets malformed input.

    Carries optional 0-based source coordinates (``line`` and ``position``)
    which, when present, are prepended to the message by ``__str__``.
    """

    line: Optional[int]
    position: Optional[int]

    def __init__(self, msg=None, line=None, position=None):
        # BUG FIX: compare against None instead of truthiness so an explicit
        # empty-string message is still forwarded to Exception.
        args = (msg,) if msg is not None else ()
        super().__init__(*args)
        self.line = line
        self.position = position

    def __str__(self):
        msg = super().__str__()
        # Only mention coordinates that were actually supplied.
        coords = {"line": self.line, "position": self.position}
        coords = {k: v for k, v in coords.items() if v is not None}
        if coords:
            coords_info = " ".join(
                f"{name} {value}" for name, value in coords.items()
            )
            msg = f"({coords_info}) {msg}"
        return msg
@dataclass
class Token(ABC):
    """Abstract base for all lexer tokens; records where the token starts."""
    line: int      # 0-based line index in the input string
    position: int  # 0-based column index within that line
@dataclass
class Operation(Token):
    """An operator token (=, +, -, *, /, ^ or a unary +/- variant)."""
    op: OP_TOKEN
@dataclass
class Parenthesis(Token):
    """An opening or closing parenthesis."""
    value: PAREN
@dataclass
class Delimiter(Token):
    """A comma or semicolon separator."""
    value: DELIMITER
@dataclass
class Number(Token):
    """A numeric literal: int when no '.' was seen, float otherwise."""
    value: Union[int, float]
@dataclass
class Variable(Token):
    """A name token (starts with an ASCII letter)."""
    name: str
def augmented(string: str):
    """Yield ``(line, position, char)`` for every character of *string*.

    Coordinates are 0-based.  A newline character is reported at its own
    coordinates; the line counter advances only for the character after it.
    """
    row, col = 0, 0
    for symbol in string:
        yield row, col, symbol
        if symbol == "\n":
            row, col = row + 1, 0
        else:
            col += 1
def _parse_buffer(buffer: str, line: int, pos: int) -> Token:
    """Turn a non-empty accumulated name/number buffer into a token.

    A buffer containing '.' is a float, one starting with a digit is an int,
    one starting with a letter is a Variable.  Raises TokenizationError for
    an unparsable number (e.g. a lone '.').
    """
    if "." in buffer:
        try:
            return Number(line, pos, float(buffer))
        except ValueError:
            # BUG FIX: a lone "." previously escaped as a bare ValueError.
            raise TokenizationError(
                f"Invalid number {repr(buffer)}", line, pos
            )
    if buffer[0] in digits:
        return Number(line, pos, int(buffer))
    # Accumulation rules guarantee the only remaining case is a name.
    return Variable(line, pos, buffer)


def tokenize(string: str):
    """Tokenize a math-expression string into a flat list of Token objects.

    Recognizes numbers (int/float), variable names, operators (= + - * / ^,
    with + and - promoted to unary when no operand precedes them),
    parentheses, and ','/';' delimiters.  Spaces and newlines only separate
    tokens.  Raises TokenizationError on any unknown or misplaced character.
    """
    tokens: List[Token] = []
    buffer = ""                 # pending name/number characters
    append_buffer: bool = False
    delimiter: Optional[DELIMITER] = None
    token: Optional[Token] = None
    last_token: Optional[Token] = None
    line = position = 0         # defined even for an empty input
    for line, position, char in augmented(string):
        if char in DELIMITER_CHARS:
            # Space/newline flush the buffer but emit no Delimiter token.
            delimiter = DELIMITERS.get(char)
            append_buffer = True
        elif char in digits or char in ascii_letters:
            buffer += char
        elif char == ".":
            if buffer:
                if buffer[0] in ascii_letters:
                    raise TokenizationError(
                        "Unexpected '.' in name",
                        line, position
                    )
                elif "." in buffer:
                    raise TokenizationError(
                        "Doubled '.' in digit",
                        line, position
                    )
            buffer += char
        elif char in OPERATIONS:
            append_buffer = True
            token = Operation(line, position, OPERATIONS[char])
        elif char in PARENS:
            append_buffer = True
            token = Parenthesis(line, position, PARENS[char])
        else:
            raise TokenizationError(
                f"Unknown {repr(char)} symbol.",
                line, position
            )

        if append_buffer:
            # BUG FIX: the buffered token precedes the delimiter in the
            # source, so it must be appended first (the original emitted
            # "f1," as [COMMA, f1]).
            if buffer:
                parsed = _parse_buffer(buffer, line, position - len(buffer))
                tokens.append(parsed)
                last_token = parsed
                buffer = ""
            if delimiter is not None:
                tokens.append(Delimiter(line, position, delimiter))
                delimiter = None
            append_buffer = False

        if isinstance(token, Operation):
            # An operator at the start of input or right after another
            # operator must be unary; only + and - have unary forms.
            if isinstance(last_token, Operation) or last_token is None:
                if token.op not in UNARIES:
                    raise TokenizationError(
                        f"Unexpected {repr(char)} operator",
                        token.line, token.position
                    )
                token.op = UNARIES[token.op]
        if token is not None:
            tokens.append(token)
            last_token = token
            token = None

    # BUG FIX: flush a trailing name/number when the input does not end with
    # a delimiter/operator (the original silently dropped it).
    if buffer:
        tokens.append(
            _parse_buffer(buffer, line, position - len(buffer) + 1)
        )
    return tokens
| test_str = """ | |
| a = 222+.2*-2.2 | |
| b(f1, f2) = f1()+f2 | |
| b(() = 0, 0) | |
| c = -+-+--++2 | |
| """ | |
| print("Test string:", test_str, "\n") | |
| print("Tokens:") | |
| print(*tokenize(test_str), sep="\n") |