Created
April 24, 2024 20:45
-
-
Save silphendio/0abad80b71c71220a6c98d23eb8bb598 to your computer and use it in GitHub Desktop.
lexer for my own programming language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from dataclasses import dataclass
from typing import Any
ObjType = int | |
class Primitives: | |
[STRING, CHAR, INT, FLOAT, SYMBOL] = range(1, 6) | |
@dataclass | |
class Object: | |
type: ObjType | |
data: any = None | |
src_pos: int = 0 # place in the source, for debugging & errors | |
re_comment = re.compile(r"//.*") # // ... | |
re_block_comment = re.compile(r"/\*.*?\*/", re.DOTALL) # /* ... */ | |
re_whitespace = re.compile(r"\s+", re.DOTALL) | |
re_identifier = re.compile(r"[^\d\W]\w*") | |
re_bracket = re.compile(r"[\(\)\[\]\{\}]") | |
# any combination of these (except comments): ! # $ % & * + , - . / : ; < = > ? @ ^ | ~ | |
# multiple operators in a row must be separated by whitespace (e.g. `TypeA<TypeB< -1> >` ) | |
re_operator = re.compile(r"[\!\#\$\%\&\*\+\,\-\.\/\:\;\<\=\>\?\@\^\|\~]+") | |
# single or double quotes, re.DOTALL for multiline strings | |
re_string = re.compile(r'"([^\\"]|(\\.))*"', re.DOTALL) | |
re_char = re.compile(r"'([^\\']|(\\.))*'") | |
re_raw_string = re.compile(r'r"(#*).*?"\1', re.DOTALL) | |
re_int = re.compile(r"((0[bo])?[0-9]+)|(0x[0-9a-f]+)", re.IGNORECASE | re.ASCII) | |
# nan/inf are symbols, defined elsewhere | |
re_float = re.compile(r"(\.[0-9]+)|([0-9]+\.?[0-9]*)(e[-+]?[0-9]+)?", re.IGNORECASE) | |
# hex floats, exponent is mandatory (C99 / C++17, python doesn't support them) | |
#re_hex_float = re.compile(r"0x(\.[0-9a-f]+)|([0-9a-f]+\.?[0-9a-f]*)p[-+]?[0-9]+", re.IGNORECASE) | |
def read_str(data: str) -> any: | |
return data[1:-1].encode('raw_unicode_escape').decode('unicode_escape') | |
def read_raw_str(data: str) -> any: | |
j = data.find('"') | |
return data[j:len(data)-j] | |
# (regex, type, conversion function) | |
# the order is important | |
patterns = [ | |
(re_comment, None, None), | |
(re_block_comment, None, None), | |
(re_whitespace, None, None), | |
(re_raw_string, Primitives.STRING, read_raw_str), | |
(re_identifier, Primitives.SYMBOL, lambda x:x), | |
(re_bracket, Primitives.SYMBOL, lambda x:x), | |
(re_operator, Primitives.SYMBOL, lambda x:x), | |
(re_string, Primitives.STRING, read_str), | |
(re_char, Primitives.CHAR, read_str), | |
(re_int, Primitives.INT, int), | |
(re_float, Primitives.FLOAT, float), | |
] | |
def tokenize(source: str): | |
tokens = [] | |
i = 0 | |
while i < len(source): | |
for regex, obj_type, get_data in patterns: | |
match = regex.match(source, i) | |
if match: | |
if obj_type is None: | |
i = match.end() | |
break | |
data = get_data(match.group()) | |
tokens.append(Object(obj_type, data, i)) | |
i = match.end() | |
break | |
else: | |
raise ValueError(f"tokenizer error at pos {i}") | |
return tokens |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment