Created
September 9, 2023 14:29
-
-
Save nikhilr612/a25b80195e97edd7cda209f99c7f4c03 to your computer and use it in GitHub Desktop.
A toy S-expressions parser in pure python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# A toy S-expressions lexer and parser written in pure python. | |
# Copyright 2023 Nikhil R. | |
# | |
from collections import namedtuple | |
from io import StringIO | |
T_INT_LITERAL = 1; | |
T_FLOAT_LITERAL = 2; | |
T_STRING_LITERAL = 3; | |
T_OPEN_PAREN = 4; | |
T_CLOSE_PAREN = 6; | |
T_SYMBOL = 7; | |
T_BOOLEAN = 8; | |
OPEN_PARENTHESIS = '(['; | |
CLOSE_PARENTHESIS= '])'; | |
STRING_QUOTES = '\"'; | |
Token = namedtuple('Token', ['type', 'data']); | |
def lexer(file): | |
""" | |
Generator to yield Tokens from file-like objects. | |
""" | |
# Using an FSM with : | |
# 1 - await whitespace, result in symbol | |
# 2 - expect ) or ], or if neither then new token | |
# 3 - string, await " | |
# 4 - expect digits, result in int | |
# 5 - expect digits, result in float | |
state = 2; | |
char_buf = []; | |
while (ch := file.read(1)): | |
if state == 1: | |
if ch.isspace(): | |
symbol = ''.join(char_buf); | |
char_buf.clear(); | |
state = 2; | |
if symbol == "True": | |
yield Token(T_BOOLEAN, True); | |
elif symbol == "False": | |
yield Token(T_BOOLEAN, False); | |
else: | |
yield Token(T_SYMBOL, symbol); | |
elif ch in CLOSE_PARENTHESIS: | |
symbol = ''.join(char_buf); | |
char_buf.clear(); | |
state = 2; | |
if symbol == "True": | |
yield Token(T_BOOLEAN, True); | |
elif symbol == "False": | |
yield Token(T_BOOLEAN, False); | |
else: | |
yield Token(T_SYMBOL, symbol); | |
yield Token(T_CLOSE_PAREN, ch); | |
else: | |
char_buf.append(ch); | |
elif state == 2: | |
if ch in CLOSE_PARENTHESIS: | |
yield Token(T_CLOSE_PAREN, ch); | |
elif ch in OPEN_PARENTHESIS: | |
yield Token(T_OPEN_PAREN, ch); | |
elif ch in STRING_QUOTES: | |
state = 3 | |
elif ch.isspace(): | |
pass | |
elif ch.isdigit() or ch in '-+': | |
state = 4; | |
char_buf.append(ch); | |
else: | |
char_buf.append(ch); | |
state = 1; | |
elif state == 3: | |
if ch == '\\': | |
next_char = file.read(1) | |
char_buf.append(ch); | |
char_buf.append(next_char); | |
elif ch in STRING_QUOTES: | |
s = ''.join(char_buf).encode('utf-8').decode('unicode-escape'); | |
state = 2; | |
char_buf.clear(); | |
yield Token(T_STRING_LITERAL, s) | |
else: | |
char_buf.append(ch); | |
elif state == 4: | |
if ch == '.' or ch == 'e': | |
state = 5; | |
char_buf.append(ch); | |
elif ch.isdigit() or (len(char_buf) == 0 and ch == '-'): | |
char_buf.append(ch); | |
elif ch.isspace(): | |
s = ''.join(char_buf) | |
char_buf.clear(); | |
try: | |
val = int(s); | |
except: | |
print("Invalid int literal: " + s); | |
else: | |
state = 2; | |
yield Token(T_INT_LITERAL, val); | |
elif ch in CLOSE_PARENTHESIS: | |
s = ''.join(char_buf) | |
char_buf.clear(); | |
try: | |
val = int(s); | |
except: | |
print("Invalid int literal: " + s); | |
else: | |
state = 2; | |
yield Token(T_INT_LITERAL, val); | |
yield Token(T_CLOSE_PAREN, ch); | |
else: | |
raise Exception("Invalid character '" + ch + "' expecting WHITESPACE or DIGIT") | |
return; | |
elif state == 5: | |
if ch.isspace(): | |
s = ''.join(char_buf); | |
char_buf.clear(); | |
try: | |
val = float(s); | |
except: | |
raise Exception("Invalid float literal " + s); | |
return; | |
else: | |
state = 2; | |
yield Token(T_FLOAT_LITERAL, val); | |
elif ch in CLOSE_PARENTHESIS: | |
s = ''.join(char_buf); | |
char_buf.clear(); | |
try: | |
val = float(s); | |
except: | |
raise Exception("Invalid float literal " + s); | |
return; | |
else: | |
state = 2; | |
yield Token(T_FLOAT_LITERAL, val); | |
yield Token(T_CLOSE_PAREN, ch); | |
elif (ch in '-e') or ch.isdigit(): | |
char_buf.append(ch); | |
else: | |
raise Exception("Invalid character '" + ch + "' expecting WHITESPACE or DIGIT") | |
## | |
# TODO: Remove the extreme code duplication that plagues this script. | |
## | |
def _recursive_build_ast(lex, parent): | |
token = next(lex) | |
try: | |
while token: | |
if token.type == T_OPEN_PAREN: | |
new_list = []; | |
parent.append(new_list); | |
_recursive_build_ast(lex, new_list); | |
elif token.type == T_CLOSE_PAREN: | |
return; | |
else: | |
parent.append(token); | |
token = next(lex); | |
except StopIteration: | |
raise Exception("Mismatched parenthesis."); | |
def build_ast(lex): | |
""" | |
Return the abstract syntax tree formed by processing the tokens yielded by the lexer. | |
""" | |
root = []; | |
token = next(lex, None) | |
while token: | |
if token.type == T_OPEN_PAREN: | |
new_list = []; | |
root.append(new_list); | |
_recursive_build_ast(lex, new_list); | |
elif token.type != T_CLOSE_PAREN: | |
root.append(token); | |
else: | |
raise Exception("Mismatched parenthesis."); | |
token = next(lex, None); | |
return root; | |
def _test_lex_string(s): | |
sfile = StringIO(s); | |
return list(lexer(sfile)); | |
def _test_parse_string(s): | |
sfile = StringIO(s); | |
return build_ast(lexer(sfile)); | |
def _run_tests(): | |
assert _test_lex_string("(]") == [Token(T_OPEN_PAREN, '('), Token(T_CLOSE_PAREN, ']')]; | |
d = _test_lex_string("( 1 1e3)"); | |
assert d == [ | |
Token(T_OPEN_PAREN, '('), | |
Token(T_INT_LITERAL, 1), | |
Token(T_FLOAT_LITERAL, 1000.0), | |
Token(T_CLOSE_PAREN, ')') | |
]; | |
d = _test_lex_string('(define (sin x) ("not implemented"))'); | |
assert d == [ | |
Token(T_OPEN_PAREN, '('), | |
Token(T_SYMBOL, 'define'), | |
Token(T_OPEN_PAREN, '('), | |
Token(T_SYMBOL, 'sin'), | |
Token(T_SYMBOL, 'x'), | |
Token(T_CLOSE_PAREN, ')'), | |
Token(T_OPEN_PAREN, '('), | |
Token(T_STRING_LITERAL, 'not implemented'), | |
Token(T_CLOSE_PAREN, ')'), | |
Token(T_CLOSE_PAREN, ')') | |
]; | |
d = _test_lex_string('(string-escape "\\t")') | |
assert d == [ | |
Token(T_OPEN_PAREN, '('), | |
Token(T_SYMBOL, 'string-escape'), | |
Token(T_STRING_LITERAL, '\t'), | |
Token(T_CLOSE_PAREN, ')') | |
]; | |
d = _test_parse_string("(display True (cond [(> a 5) 'A] [(!= a 0) 'B]) (random symbol))(another -1 -2.3e-2)"); | |
assert d == [ | |
[ # ( | |
Token(T_SYMBOL, 'display'), # display | |
Token(T_BOOLEAN, True), # True | |
[ # ( | |
Token(T_SYMBOL, 'cond'), # cond | |
[ # [ | |
[Token(T_SYMBOL, '>'), Token(T_SYMBOL, 'a'), Token(T_INT_LITERAL, 5)], # (> a 5) | |
Token(T_SYMBOL, "'A") # 'A | |
], # ] | |
[ # [ | |
[Token(T_SYMBOL, '!='), Token(T_SYMBOL, 'a'), Token(T_INT_LITERAL, 0)], # (!= a 0) | |
Token(T_SYMBOL, "'B") # 'B | |
], # ] | |
], # ) | |
[Token(T_SYMBOL, 'random'), Token(T_SYMBOL, 'symbol')] # (random symbol) | |
], # ) | |
[ # ( | |
Token(T_SYMBOL, 'another'), # another | |
Token(T_INT_LITERAL, -1), # -1 | |
Token(T_FLOAT_LITERAL, -0.023) # -2.3e-2 | |
] # ) | |
]; | |
if __name__ == "__main__": | |
_run_tests(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment