Created
April 24, 2024 20:45
-
-
Save silphendio/0abad80b71c71220a6c98d23eb8bb598 to your computer and use it in GitHub Desktop.
lexer for my own programming language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from dataclasses import dataclass
from typing import Any
ObjType = int | |
class Primitives: | |
[STRING, CHAR, INT, FLOAT, SYMBOL] = range(1, 6) | |
@dataclass | |
class Object: | |
type: ObjType | |
data: any = None | |
src_pos: int = 0 # place in the source, for debugging & errors | |
re_comment = re.compile(r"//.*") # // ... | |
re_block_comment = re.compile(r"/\*.*?\*/", re.DOTALL) # /* ... */ | |
re_whitespace = re.compile(r"\s+", re.DOTALL) | |
re_identifier = re.compile(r"[^\d\W]\w*") | |
re_bracket = re.compile(r"[\(\)\[\]\{\}]") | |
# any combination of these (except comments): ! # $ % & * + , - . / : ; < = > ? @ ^ | ~ | |
# multiple operators in a row must be separated by whitespace (e.g. `TypeA<TypeB< -1> >` ) | |
re_operator = re.compile(r"[\!\#\$\%\&\*\+\,\-\.\/\:\;\<\=\>\?\@\^\|\~]+") | |
# single or double quotes, re.DOTALL for multiline strings | |
re_string = re.compile(r'"([^\\"]|(\\.))*"', re.DOTALL) | |
re_char = re.compile(r"'([^\\']|(\\.))*'") | |
re_raw_string = re.compile(r'r"(#*).*?"\1', re.DOTALL) | |
re_int = re.compile(r"((0[bo])?[0-9]+)|(0x[0-9a-f]+)", re.IGNORECASE | re.ASCII) | |
# nan/inf are symbols, defined elsewhere | |
re_float = re.compile(r"(\.[0-9]+)|([0-9]+\.?[0-9]*)(e[-+]?[0-9]+)?", re.IGNORECASE) | |
# hex floats, exponent is mandatory (C99 / C++17, python doesn't support them) | |
#re_hex_float = re.compile(r"0x(\.[0-9a-f]+)|([0-9a-f]+\.?[0-9a-f]*)p[-+]?[0-9]+", re.IGNORECASE) | |
def read_str(data: str) -> any: | |
return data[1:-1].encode('raw_unicode_escape').decode('unicode_escape') | |
def read_raw_str(data: str) -> any: | |
j = data.find('"') | |
return data[j:len(data)-j] | |
# (regex, type, conversion function) | |
# the order is important | |
patterns = [ | |
(re_comment, None, None), | |
(re_block_comment, None, None), | |
(re_whitespace, None, None), | |
(re_raw_string, Primitives.STRING, read_raw_str), | |
(re_identifier, Primitives.SYMBOL, lambda x:x), | |
(re_bracket, Primitives.SYMBOL, lambda x:x), | |
(re_operator, Primitives.SYMBOL, lambda x:x), | |
(re_string, Primitives.STRING, read_str), | |
(re_char, Primitives.CHAR, read_str), | |
(re_int, Primitives.INT, int), | |
(re_float, Primitives.FLOAT, float), | |
] | |
def tokenize(source: str): | |
tokens = [] | |
i = 0 | |
while i < len(source): | |
for regex, obj_type, get_data in patterns: | |
match = regex.match(source, i) | |
if match: | |
if obj_type is None: | |
i = match.end() | |
break | |
data = get_data(match.group()) | |
tokens.append(Object(obj_type, data, i)) | |
i = match.end() | |
break | |
else: | |
raise ValueError(f"tokenizer error at pos {i}") | |
return tokens |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment