Created
March 25, 2023 05:25
-
-
Save danesherbs/a57a91a6c5538ddd9e0801fc3e217423 to your computer and use it in GitHub Desktop.
A tiny compiler front end (tokenizer and recursive-descent parser) for a Ruby-like language, written in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A teeny tiny compiler!""" | |
import re

from dataclasses import dataclass
from typing import Any, List, Tuple
# Token definitions as (type name, regex) pairs. ORDER MATTERS:
# tokenize_one_token returns the first entry that matches, so the
# "def"/"end" keyword patterns must come before the more general
# "identifier" pattern or keywords would be lexed as identifiers.
TOKEN_TYPES = [
    ("def", r"\bdef\b"),
    ("end", r"\bend\b"),
    ("identifier", r"\b[a-zA-Z]+\b"),
    ("integer", r"\b[0-9]+\b"),
    ("oparen", r"\("),
    ("cparen", r"\)"),
    ("comma", r",")
]
@dataclass
class Token:
    """A single lexical unit produced by the Tokenizer."""

    # Category name -- one of the first elements in TOKEN_TYPES.
    type: str
    # The exact source text the pattern matched.
    value: str
@dataclass
class DefNode:
    """AST node for a method definition: ``def name(args) body end``."""

    # Method name (from the identifier token after "def").
    name: str
    # Declared parameter names, in source order.
    arg_names: List[str]
    # The single body expression. Annotated Any instead of the original
    # ``any`` -- the builtin any() function is not a type. Parser.parse_def
    # fills this with whatever parse_expr returns.
    body: Any
@dataclass
class IntegerNode:
    """AST leaf node for an integer literal."""

    # Numeric value, converted from the token text via int().
    value: int
class Tokenizer:
    """Splits source code into a flat list of Tokens using TOKEN_TYPES."""

    def __init__(self, code: str):
        # Remaining unconsumed source text; shrinks as tokens are matched.
        self.code = code

    def tokenize(self) -> List[Token]:
        """Consume self.code and return all tokens in source order.

        Raises ValueError when non-whitespace text matches no pattern
        in TOKEN_TYPES.
        """
        tokens = []
        # BUG FIX: strip BEFORE the first match attempt as well. The
        # original only stripped after each token, so source starting
        # with whitespace (e.g. an indented first line) raised
        # ValueError instead of tokenizing.
        self.code = self.code.strip()
        while self.code:
            tokens.append(self.tokenize_one_token())
            self.code = self.code.strip()
        return tokens

    def tokenize_one_token(self) -> Token:
        """Match exactly one token at the start of self.code and consume it."""
        for token_type, regex in TOKEN_TYPES:
            # \A anchors the pattern to the very start of the remaining text.
            match = re.match(fr"\A({regex})", self.code)
            if match:
                value = match.group(1)
                self.code = self.code[len(value):]
                return Token(type=token_type, value=value)
        raise ValueError(f"Couldn't match token on {self.code!r}")
class Parser:
    """Recursive-descent parser turning a Token list into an AST.

    Grammar (one top-level definition):
        def  := "def" identifier "(" arg_names ")" expr "end"
        expr := integer | identifier "(" arg_exprs ")" | identifier
    """

    def __init__(self, tokens: List[Token]):
        # Unconsumed tokens; consume() pops from the front.
        self.tokens = tokens

    def parse(self):
        return self.parse_def()

    def parse_def(self):
        """Parse ``def name(args) body end`` into a DefNode."""
        self.consume("def")
        name = self.consume("identifier").value
        arg_names = self.parse_arg_names()
        body = self.parse_expr()
        self.consume("end")
        return DefNode(name=name, arg_names=arg_names, body=body)

    def parse_arg_names(self):
        """Parse a parenthesized, comma-separated list of parameter names."""
        arg_names = []
        self.consume("oparen")
        if self.peek("identifier"):
            arg_names.append(self.consume("identifier").value)
            while self.peek("comma"):
                self.consume("comma")
                arg_names.append(self.consume("identifier").value)
        self.consume("cparen")
        return arg_names

    def parse_expr(self):
        """Dispatch on the upcoming token(s): literal, call, or variable ref."""
        if self.peek("integer"):
            return self.parse_integer()
        elif self.peek("identifier") and self.peek("oparen", 1):
            # identifier followed by "(" means a call, not a variable.
            return self.parse_call()
        else:
            return self.parse_var_ref()

    def parse_integer(self):
        return IntegerNode(value=int(self.consume("integer").value))

    def parse_call(self):
        # NOTE(review): CallNode is not defined in this chunk of the file --
        # presumably declared alongside the other node dataclasses; confirm.
        name = self.consume("identifier").value
        arg_exprs = self.parse_arg_exprs()
        return CallNode(name=name, arg_exprs=arg_exprs)

    def parse_arg_exprs(self):
        """Parse a parenthesized, comma-separated list of argument expressions."""
        arg_exprs = []
        self.consume("oparen")
        if not self.peek("cparen"):
            arg_exprs.append(self.parse_expr())
            while self.peek("comma"):
                self.consume("comma")
                arg_exprs.append(self.parse_expr())
        self.consume("cparen")
        return arg_exprs

    def parse_var_ref(self):
        # NOTE(review): VarRefNode is likewise defined outside this chunk.
        return VarRefNode(value=self.consume("identifier").value)

    def consume(self, expected_type):
        """Pop and return the next token, requiring it to have expected_type.

        Raises ValueError on a type mismatch, and also (fix) on unexpected
        end of input -- the original let list.pop(0) raise a bare
        IndexError when tokens ran out.
        """
        if not self.tokens:
            raise ValueError(
                f"Expected token type {expected_type!r} but ran out of tokens"
            )
        token = self.tokens.pop(0)
        if token.type == expected_type:
            return token
        else:
            raise ValueError(f"Expected token type {expected_type!r} but got {token.type!r}")

    def peek(self, expected_type, offset=0):
        """True iff the token `offset` positions ahead has expected_type.

        Returns False (fix: rather than raising IndexError) when fewer than
        offset + 1 tokens remain -- parse_expr looks two tokens ahead and
        previously crashed on truncated input such as a lone identifier.
        """
        if offset >= len(self.tokens):
            return False
        return self.tokens[offset].type == expected_type
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment