Skip to content

Instantly share code, notes, and snippets.

@danesherbs
Created March 25, 2023 05:25
Show Gist options
Save danesherbs/a57a91a6c5538ddd9e0801fc3e217423 to your computer and use it in GitHub Desktop.
A tiny compiler for a Ruby-like language
"""A teeny tiny compiler!"""
import re
from dataclasses import dataclass
from typing import Any, List, Tuple
# Lexer table: (token type name, regex source) pairs.
# The tokenizer walks this list top to bottom and takes the first pattern that
# matches, so the keyword entries must stay ahead of the more general
# "identifier" entry.
TOKEN_TYPES: List[Tuple[str, str]] = [
    ("def", r"\bdef\b"),
    ("end", r"\bend\b"),
    ("identifier", r"\b[a-zA-Z]+\b"),
    ("integer", r"\b[0-9]+\b"),
    ("oparen", r"\("),
    ("cparen", r"\)"),
    ("comma", r","),
]
@dataclass
class Token:
    """A single lexical token.

    Attributes:
        type: Name of the token kind, e.g. ``"identifier"`` or ``"oparen"``.
        value: The exact source text that was matched for this token.
    """

    type: str
    value: str
@dataclass
class DefNode:
    """AST node for a function definition.

    Attributes:
        name: Name of the function being defined.
        arg_names: Parameter names, in declaration order.
        body: The expression node that forms the function body.
    """

    name: str
    arg_names: List[str]
    # Any expression node may appear here.  (The builtin function ``any`` is
    # not a type; ``typing.Any`` is the annotation that was meant.)
    body: Any
@dataclass
class IntegerNode:
    """AST node for an integer literal.

    Attributes:
        value: The literal's numeric value.
    """

    value: int
class Tokenizer:
    """Split source text into a flat list of ``Token`` objects.

    Token kinds and their regexes come from the module-level ``TOKEN_TYPES``
    table, tried in order.  The tokenizer consumes ``self.code`` as it reads,
    so an instance is single-use: once ``tokenize()`` returns, ``self.code``
    is empty.
    """

    def __init__(self, code: str):
        # Unconsumed remainder of the input; shrinks as tokens are read.
        self.code = code

    def tokenize(self) -> List[Token]:
        """Consume all of ``self.code`` and return its tokens in order.

        Whitespace between tokens and at either end of the input is ignored,
        so empty or whitespace-only input yields an empty list.

        Raises:
            ValueError: If the remaining input does not start with any
                pattern in ``TOKEN_TYPES``.
        """
        tokens = []
        # Strip *before* the first match as well as after each token;
        # otherwise input that starts with whitespace (or consists only of
        # whitespace) fails to match any token pattern.
        self.code = self.code.strip()
        while self.code:
            tokens.append(self.tokenize_one_token())
            self.code = self.code.lstrip()
        return tokens

    def tokenize_one_token(self) -> Token:
        """Read one token from the front of ``self.code`` and remove it.

        Leading whitespace is skipped first.  Patterns are tried in
        ``TOKEN_TYPES`` order and the first one that matches wins.

        Raises:
            ValueError: If no pattern matches at the current position
                (including when nothing but whitespace is left).
        """
        self.code = self.code.lstrip()
        for token_type, pattern in TOKEN_TYPES:
            # re.match only matches at the start of the string, so no
            # explicit anchor is needed.
            match = re.match(pattern, self.code)
            if match:
                value = match.group(0)
                self.code = self.code[len(value):]
                return Token(type=token_type, value=value)
        raise ValueError(f"Couldn't match token on {self.code!r}")
class Parser:
    """Recursive-descent parser that turns a token list into an AST.

    Grammar handled by the methods below::

        def       := "def" identifier arg_names expr "end"
        arg_names := "(" [identifier ("," identifier)*] ")"
        expr      := integer | call | var_ref
        call      := identifier "(" [expr ("," expr)*] ")"
        var_ref   := identifier

    Tokens are removed from the front of the list given to the constructor
    as they are consumed, so that list is mutated.

    NOTE(review): ``CallNode`` and ``VarRefNode`` are referenced here but are
    not defined in the part of the file this class sits in; they must be
    provided elsewhere in the module.
    """

    def __init__(self, tokens: "List[Token]"):
        # Remaining, not-yet-consumed tokens (front of the list is next).
        self.tokens = tokens

    def parse(self):
        """Parse the token stream as a single function definition."""
        return self.parse_def()

    def parse_def(self):
        """Parse ``def name(args) expr end`` and return a ``DefNode``."""
        self.consume("def")
        name = self.consume("identifier").value
        arg_names = self.parse_arg_names()
        body = self.parse_expr()
        self.consume("end")
        return DefNode(name=name, arg_names=arg_names, body=body)

    def parse_arg_names(self):
        """Parse a parenthesised, comma-separated parameter list.

        Returns:
            The parameter names as a list of strings (empty for ``()``).
        """
        arg_names = []
        self.consume("oparen")
        if self.peek("identifier"):
            arg_names.append(self.consume("identifier").value)
            while self.peek("comma"):
                self.consume("comma")
                arg_names.append(self.consume("identifier").value)
        self.consume("cparen")
        return arg_names

    def parse_expr(self):
        """Parse one expression: integer literal, call, or variable reference."""
        if self.peek("integer"):
            return self.parse_integer()
        # An identifier immediately followed by "(" is a call; any other
        # token is handed to parse_var_ref, which reports the error if it
        # is not an identifier.
        if self.peek("identifier") and self.peek("oparen", 1):
            return self.parse_call()
        return self.parse_var_ref()

    def parse_integer(self):
        """Parse an integer token into an ``IntegerNode``."""
        return IntegerNode(value=int(self.consume("integer").value))

    def parse_call(self):
        """Parse ``name(arg, ...)`` into a ``CallNode``."""
        name = self.consume("identifier").value
        arg_exprs = self.parse_arg_exprs()
        return CallNode(name=name, arg_exprs=arg_exprs)

    def parse_arg_exprs(self):
        """Parse a parenthesised, comma-separated list of argument expressions.

        Returns:
            The parsed expression nodes as a list (empty for ``()``).
        """
        arg_exprs = []
        self.consume("oparen")
        if not self.peek("cparen"):
            arg_exprs.append(self.parse_expr())
            while self.peek("comma"):
                self.consume("comma")
                arg_exprs.append(self.parse_expr())
        self.consume("cparen")
        return arg_exprs

    def parse_var_ref(self):
        """Parse a bare identifier into a ``VarRefNode``."""
        return VarRefNode(value=self.consume("identifier").value)

    def consume(self, expected_type):
        """Remove and return the next token, requiring it to be ``expected_type``.

        Raises:
            ValueError: If the next token has a different type, or if there
                are no tokens left (truncated input).
        """
        if not self.tokens:
            # Report truncated input as a parse error rather than letting
            # list.pop raise a bare IndexError.
            raise ValueError(
                f"Expected token type {expected_type!r} but reached end of input"
            )
        token = self.tokens.pop(0)
        if token.type != expected_type:
            raise ValueError(f"Expected token type {expected_type!r} but got {token.type!r}")
        return token

    def peek(self, expected_type, offset=0):
        """Report whether the token ``offset`` places ahead has ``expected_type``.

        Nothing is consumed.  Looking past the end of the token stream yields
        ``False`` instead of raising, so callers can probe safely at end of
        input.
        """
        try:
            token = self.tokens[offset]
        except IndexError:
            return False
        return token.type == expected_type
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment