Skip to content

Instantly share code, notes, and snippets.

@silphendio
Created April 24, 2024 20:45
Show Gist options
  • Save silphendio/0abad80b71c71220a6c98d23eb8bb598 to your computer and use it in GitHub Desktop.
lexer for my own programming language
from dataclasses import dataclass
import re
ObjType = int
class Primitives:
[STRING, CHAR, INT, FLOAT, SYMBOL] = range(1, 6)
@dataclass
class Object:
type: ObjType
data: any = None
src_pos: int = 0 # place in the source, for debugging & errors
re_comment = re.compile(r"//.*") # // ...
re_block_comment = re.compile(r"/\*.*?\*/", re.DOTALL) # /* ... */
re_whitespace = re.compile(r"\s+", re.DOTALL)
re_identifier = re.compile(r"[^\d\W]\w*")
re_bracket = re.compile(r"[\(\)\[\]\{\}]")
# any combination of these (except comments): ! # $ % & * + , - . / : ; < = > ? @ ^ | ~
# multiple operators in a row must be separated by whitespace (e.g. `TypeA<TypeB< -1> >` )
re_operator = re.compile(r"[\!\#\$\%\&\*\+\,\-\.\/\:\;\<\=\>\?\@\^\|\~]+")
# single or double quotes, re.DOTALL for multiline strings
re_string = re.compile(r'"([^\\"]|(\\.))*"', re.DOTALL)
re_char = re.compile(r"'([^\\']|(\\.))*'")
re_raw_string = re.compile(r'r"(#*).*?"\1', re.DOTALL)
re_int = re.compile(r"((0[bo])?[0-9]+)|(0x[0-9a-f]+)", re.IGNORECASE | re.ASCII)
# nan/inf are symbols, defined elsewhere
re_float = re.compile(r"(\.[0-9]+)|([0-9]+\.?[0-9]*)(e[-+]?[0-9]+)?", re.IGNORECASE)
# hex floats, exponent is mandatory (C99 / C++17, python doesn't support them)
#re_hex_float = re.compile(r"0x(\.[0-9a-f]+)|([0-9a-f]+\.?[0-9a-f]*)p[-+]?[0-9]+", re.IGNORECASE)
def read_str(data: str) -> any:
return data[1:-1].encode('raw_unicode_escape').decode('unicode_escape')
def read_raw_str(data: str) -> any:
j = data.find('"')
return data[j:len(data)-j]
# (regex, type, conversion function)
# the order is important
patterns = [
(re_comment, None, None),
(re_block_comment, None, None),
(re_whitespace, None, None),
(re_raw_string, Primitives.STRING, read_raw_str),
(re_identifier, Primitives.SYMBOL, lambda x:x),
(re_bracket, Primitives.SYMBOL, lambda x:x),
(re_operator, Primitives.SYMBOL, lambda x:x),
(re_string, Primitives.STRING, read_str),
(re_char, Primitives.CHAR, read_str),
(re_int, Primitives.INT, int),
(re_float, Primitives.FLOAT, float),
]
def tokenize(source: str):
tokens = []
i = 0
while i < len(source):
for regex, obj_type, get_data in patterns:
match = regex.match(source, i)
if match:
if obj_type is None:
i = match.end()
break
data = get_data(match.group())
tokens.append(Object(obj_type, data, i))
i = match.end()
break
else:
raise ValueError(f"tokenizer error at pos {i}")
return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment