Skip to content

Instantly share code, notes, and snippets.

@io-mi
Created April 13, 2022 09:03
Show Gist options
  • Save io-mi/35f016cff11f5d978a32a1107d53ae23 to your computer and use it in GitHub Desktop.
Save io-mi/35f016cff11f5d978a32a1107d53ae23 to your computer and use it in GitHub Desktop.
Simple Lexer & Parser in Python
from typing import Iterator, Match, TypeVar, Type
from dataclasses import dataclass, field
from enum import Enum
import re
class TokenKind(int, Enum):
NUMBER = 0x01
SYMBOL = 0x02
ASSIGN = 0x03
POW = 0x04
ADD = 0x05
SUB = 0x06
MUL = 0x07
DIV = 0x08
COMMA = 0x09
LPAREN = 0x0a
RPAREN = 0x0b
SEMICOLON = 0x0c
NEWLINE = 0x0d
EOF = 0x0e
T = TypeVar('T', bound="Token")
@dataclass(slots=True, frozen=True)
class Token:
kind: TokenKind
line: int = field(repr=False)
head: int = field(repr=False)
tail: int = field(repr=False)
orig: str = field(repr=False)
@classmethod
def from_match(cls:Type[T], line:int, match:Match) -> T:
return cls(TokenKind(match.lastindex), line, match.start(), match.end(), match.group())
class TokenIter:
def __init__(self, matches:Iterator[Match[str]]):
self.matches = matches
self.line = 0
self.peek = Token.from_match(self.line, next(self.matches))
def __iter__(self) -> Iterator[Token]:
return self
def __next__(self) -> Token:
if self.peek.kind is TokenKind.NEWLINE:
self.line += 1
token, self.peek = self.peek, Token.from_match(self.line, next(self.matches))
return token
class Lexer:
pattern = re.compile("|".join((
"([0-9]+\.[0-9]+|[0-9]+)", # NUMBER
"([\_a-zA-Z]+[0-9]*[\_a-zA-Z]*)", # SYMBOL
"(\=)", # ASSIGN
"(\*\*)", # POW
"(\+)", # ADD
"(\-)", # SUB
"(\*)", # MUL
"(\/)", # DIV
"(\,)", # COMMA
"(\()", # LPARAN
"(\))", # RPARAN
"(\;)", # SEMICOLON
"(\n)", # NEWLINE
"($)", # EOF
)))
@staticmethod
def tokenize(source:str) -> Iterator[Token]:
return TokenIter(Lexer.pattern.finditer(source))
@dataclass(slots=True, frozen=True)
class Node:
pass
@dataclass(slots=True, frozen=True)
class Tree(Node):
nodes: tuple[Node]
@dataclass(slots=True, frozen=True)
class Number(Node):
value: Token = field(repr=False)
@dataclass(slots=True, frozen=True)
class Symbol(Node):
name: Token = field(repr=False)
@dataclass(slots=True, frozen=True)
class Assign(Node):
symbol: Symbol
node: Node
@dataclass(slots=True, frozen=True)
class UnaryOp(Node):
operator: Token
operand: Node
@dataclass(slots=True, frozen=True)
class BinaryOp(Node):
operator: Token
operandl: Node
operandr: Node
class Parser:
@staticmethod
def source(node:Node) -> str:
src = ""
if isinstance(node, Tree):
src += ';'.join(Parser.source(n) for n in node.nodes)
src += ';'
elif isinstance(node, Number):
src += node.value.orig
elif isinstance(node, Symbol):
src += node.name.orig
elif isinstance(node, Assign):
src += "{0} = {1}".format(
Parser.source(node.symbol),
Parser.source(node.node))
elif isinstance(node, UnaryOp):
src += " {0}{1}".format(node.operator.orig,
Parser.source(node.operand))
elif isinstance(node, BinaryOp):
src += "({1} {0} {2})".format(node.operator.orig,
Parser.source(node.operandl),
Parser.source(node.operandr))
return src
@staticmethod
def tree(source:str) -> Tree:
# ter ::= { add ";" }
# add ::= mul { aso mul }
# mul ::= pow { mdo pow }
# pow ::= [ aso ] pow | aux [ "**" pow ]
# aux ::= num | sym [ "=" add ] | "(" add ")" | "\n" add
#
# aso ::= "+" | "-"
# mdo ::= "*" | "/"
tokens:Iterator[Token] = Lexer.tokenize(source)
tree = Parser._ter(tokens)
assert tokens.peek.kind is TokenKind.EOF,\
"Unexpected token {0}, line {1}"\
.format(tokens.peek.kind.name, tokens.peek.line)
return Tree(tuple(tree))
@staticmethod
def _ter(tokens:Iterator[Token]) -> Node:
tree = list()
while tokens.peek.kind is not TokenKind.EOF:
tree.append(Parser._add(tokens))
assert tokens.peek.kind is TokenKind.SEMICOLON,\
"Expected ';', line {0}".format(tokens.peek.line)
next(tokens) # semicolon
return tree
@staticmethod
def _add(tokens:Iterator[Token]) -> Node:
node = Parser._mul(tokens)
while tokens.peek.kind in {TokenKind.ADD, TokenKind.SUB}:
node = BinaryOp(next(tokens), node, Parser._mul(tokens))
return node
@staticmethod
def _mul(tokens:Iterator[Token]) -> Node:
node = Parser._pow(tokens)
while tokens.peek.kind in {TokenKind.MUL, TokenKind.DIV}:
node = BinaryOp(next(tokens), node, Parser._pow(tokens))
return node
@staticmethod
def _pow(tokens:Iterator[Token]) -> Node:
if tokens.peek.kind in {TokenKind.ADD, TokenKind.SUB}:
return UnaryOp(next(tokens), Parser._pow(tokens))
node = Parser._aux(tokens)
if tokens.peek.kind is TokenKind.POW:
node = BinaryOp(next(tokens), node, Parser._pow(tokens))
return node
@staticmethod
def _aux(tokens:Iterator[Token]) -> Node:
if tokens.peek.kind is TokenKind.NUMBER:
return Number(next(tokens))
if tokens.peek.kind is TokenKind.SYMBOL:
node = Symbol(next(tokens))
if tokens.peek.kind == TokenKind.ASSIGN:
next(tokens)# assign
node = Assign(node, Parser._add(tokens))
return node
if tokens.peek.kind is TokenKind.LPAREN :
next(tokens) # lparen
node = Parser._add(tokens)
assert tokens.peek.kind is TokenKind.RPAREN ,\
"Expected ')', line {0}".format(tokens.peek.line)
next(tokens) # rparen
return node
if tokens.peek.kind is TokenKind.NEWLINE:
next(tokens) # newline
return Parser._add(tokens)
raise SyntaxError("Unexpected token {0}, line {1}"\
.format(tokens.peek.kind.name,tokens.peek.line))
if __name__ == "__main__":
tree = Parser.tree("a * (b + c) ** -(d - e);")
print(tree)
print(Parser.source(tree))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment