Created
April 13, 2022 09:03
-
-
Save io-mi/35f016cff11f5d978a32a1107d53ae23 to your computer and use it in GitHub Desktop.
Simple Lexer & Parser in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Iterator, Match, TypeVar, Type | |
from dataclasses import dataclass, field | |
from enum import Enum | |
import re | |
class TokenKind(int, Enum): | |
NUMBER = 0x01 | |
SYMBOL = 0x02 | |
ASSIGN = 0x03 | |
POW = 0x04 | |
ADD = 0x05 | |
SUB = 0x06 | |
MUL = 0x07 | |
DIV = 0x08 | |
COMMA = 0x09 | |
LPAREN = 0x0a | |
RPAREN = 0x0b | |
SEMICOLON = 0x0c | |
NEWLINE = 0x0d | |
EOF = 0x0e | |
T = TypeVar('T', bound="Token") | |
@dataclass(slots=True, frozen=True) | |
class Token: | |
kind: TokenKind | |
line: int = field(repr=False) | |
head: int = field(repr=False) | |
tail: int = field(repr=False) | |
orig: str = field(repr=False) | |
@classmethod | |
def from_match(cls:Type[T], line:int, match:Match) -> T: | |
return cls(TokenKind(match.lastindex), line, match.start(), match.end(), match.group()) | |
class TokenIter: | |
def __init__(self, matches:Iterator[Match[str]]): | |
self.matches = matches | |
self.line = 0 | |
self.peek = Token.from_match(self.line, next(self.matches)) | |
def __iter__(self) -> Iterator[Token]: | |
return self | |
def __next__(self) -> Token: | |
if self.peek.kind is TokenKind.NEWLINE: | |
self.line += 1 | |
token, self.peek = self.peek, Token.from_match(self.line, next(self.matches)) | |
return token | |
class Lexer: | |
pattern = re.compile("|".join(( | |
"([0-9]+\.[0-9]+|[0-9]+)", # NUMBER | |
"([\_a-zA-Z]+[0-9]*[\_a-zA-Z]*)", # SYMBOL | |
"(\=)", # ASSIGN | |
"(\*\*)", # POW | |
"(\+)", # ADD | |
"(\-)", # SUB | |
"(\*)", # MUL | |
"(\/)", # DIV | |
"(\,)", # COMMA | |
"(\()", # LPARAN | |
"(\))", # RPARAN | |
"(\;)", # SEMICOLON | |
"(\n)", # NEWLINE | |
"($)", # EOF | |
))) | |
@staticmethod | |
def tokenize(source:str) -> Iterator[Token]: | |
return TokenIter(Lexer.pattern.finditer(source)) | |
@dataclass(slots=True, frozen=True) | |
class Node: | |
pass | |
@dataclass(slots=True, frozen=True) | |
class Tree(Node): | |
nodes: tuple[Node] | |
@dataclass(slots=True, frozen=True) | |
class Number(Node): | |
value: Token = field(repr=False) | |
@dataclass(slots=True, frozen=True) | |
class Symbol(Node): | |
name: Token = field(repr=False) | |
@dataclass(slots=True, frozen=True) | |
class Assign(Node): | |
symbol: Symbol | |
node: Node | |
@dataclass(slots=True, frozen=True) | |
class UnaryOp(Node): | |
operator: Token | |
operand: Node | |
@dataclass(slots=True, frozen=True) | |
class BinaryOp(Node): | |
operator: Token | |
operandl: Node | |
operandr: Node | |
class Parser: | |
@staticmethod | |
def source(node:Node) -> str: | |
src = "" | |
if isinstance(node, Tree): | |
src += ';'.join(Parser.source(n) for n in node.nodes) | |
src += ';' | |
elif isinstance(node, Number): | |
src += node.value.orig | |
elif isinstance(node, Symbol): | |
src += node.name.orig | |
elif isinstance(node, Assign): | |
src += "{0} = {1}".format( | |
Parser.source(node.symbol), | |
Parser.source(node.node)) | |
elif isinstance(node, UnaryOp): | |
src += " {0}{1}".format(node.operator.orig, | |
Parser.source(node.operand)) | |
elif isinstance(node, BinaryOp): | |
src += "({1} {0} {2})".format(node.operator.orig, | |
Parser.source(node.operandl), | |
Parser.source(node.operandr)) | |
return src | |
@staticmethod | |
def tree(source:str) -> Tree: | |
# ter ::= { add ";" } | |
# add ::= mul { aso mul } | |
# mul ::= pow { mdo pow } | |
# pow ::= [ aso ] pow | aux [ "**" pow ] | |
# aux ::= num | sym [ "=" add ] | "(" add ")" | "\n" add | |
# | |
# aso ::= "+" | "-" | |
# mdo ::= "*" | "/" | |
tokens:Iterator[Token] = Lexer.tokenize(source) | |
tree = Parser._ter(tokens) | |
assert tokens.peek.kind is TokenKind.EOF,\ | |
"Unexpected token {0}, line {1}"\ | |
.format(tokens.peek.kind.name, tokens.peek.line) | |
return Tree(tuple(tree)) | |
@staticmethod | |
def _ter(tokens:Iterator[Token]) -> Node: | |
tree = list() | |
while tokens.peek.kind is not TokenKind.EOF: | |
tree.append(Parser._add(tokens)) | |
assert tokens.peek.kind is TokenKind.SEMICOLON,\ | |
"Expected ';', line {0}".format(tokens.peek.line) | |
next(tokens) # semicolon | |
return tree | |
@staticmethod | |
def _add(tokens:Iterator[Token]) -> Node: | |
node = Parser._mul(tokens) | |
while tokens.peek.kind in {TokenKind.ADD, TokenKind.SUB}: | |
node = BinaryOp(next(tokens), node, Parser._mul(tokens)) | |
return node | |
@staticmethod | |
def _mul(tokens:Iterator[Token]) -> Node: | |
node = Parser._pow(tokens) | |
while tokens.peek.kind in {TokenKind.MUL, TokenKind.DIV}: | |
node = BinaryOp(next(tokens), node, Parser._pow(tokens)) | |
return node | |
@staticmethod | |
def _pow(tokens:Iterator[Token]) -> Node: | |
if tokens.peek.kind in {TokenKind.ADD, TokenKind.SUB}: | |
return UnaryOp(next(tokens), Parser._pow(tokens)) | |
node = Parser._aux(tokens) | |
if tokens.peek.kind is TokenKind.POW: | |
node = BinaryOp(next(tokens), node, Parser._pow(tokens)) | |
return node | |
@staticmethod | |
def _aux(tokens:Iterator[Token]) -> Node: | |
if tokens.peek.kind is TokenKind.NUMBER: | |
return Number(next(tokens)) | |
if tokens.peek.kind is TokenKind.SYMBOL: | |
node = Symbol(next(tokens)) | |
if tokens.peek.kind == TokenKind.ASSIGN: | |
next(tokens)# assign | |
node = Assign(node, Parser._add(tokens)) | |
return node | |
if tokens.peek.kind is TokenKind.LPAREN : | |
next(tokens) # lparen | |
node = Parser._add(tokens) | |
assert tokens.peek.kind is TokenKind.RPAREN ,\ | |
"Expected ')', line {0}".format(tokens.peek.line) | |
next(tokens) # rparen | |
return node | |
if tokens.peek.kind is TokenKind.NEWLINE: | |
next(tokens) # newline | |
return Parser._add(tokens) | |
raise SyntaxError("Unexpected token {0}, line {1}"\ | |
.format(tokens.peek.kind.name,tokens.peek.line)) | |
if __name__ == "__main__": | |
tree = Parser.tree("a * (b + c) ** -(d - e);") | |
print(tree) | |
print(Parser.source(tree)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment