the
import re
from lexer import lex, LexError


def readlines_iter(f):
    # Yield lines from a file object with their trailing newline stripped.
    while (line := f.readline()):
        yield line.rstrip('\n')


def main(lines):
    tok_patterns = {}
    prods = {}
    last_prod = None
    for line in lines:
        if line.strip().startswith('|='):
            # Continuation line: another alternative for the previous production.
            assert last_prod
            prod, _, *terms = last_prod, *line.split()
        else:
            prod, op, termsline = line.split(maxsplit=2)
            if op == '|=':
                terms = termsline.split()
            else:
                assert op == '~~'
                terms = (termsline,)
        last_prod = prod
        for term in terms:
            left, middle, right = term[0], term[1:-1], term[-1]
            if left == '<' and right == '>':
                # <name>: reference to a nonterminal
                prods[middle] = prods.get(middle, [])
            elif left == '[' and right == ']':
                # [text]: literal token
                assert middle not in tok_patterns
                tok_patterns[middle] = None
            elif left == '{' and right == '}':
                # {regex}: token pattern bound to this production name
                assert prod not in tok_patterns
                print("Adding regex for", prod, repr(middle))
                tok_patterns[prod] = re.compile(middle)

    with open('test.the') as f:
        try:
            for tok, lit in lex(tok_patterns, f.read()):
                print(f'[{tok}]' + (f' {lit}' if lit else ''))
        except LexError as e:
            e.pretty_print()


def handle(s):
    # A line is either a production ("name ← ...") or a token rule ("name ≡ regex").
    linere = re.compile(r'((?P<term>\w+)\s*←)|((?P<tok>\w+)\s*≡)')
    # One right-hand-side item: either ⦃name⦄ or a bare literal.
    prodre = re.compile(r'\s*((⦃(?P<term>\w+)⦄)|(?P<lit>[^\s]+))')
    m = linere.match(s)
    if not m:
        return False
    print("m", m)
    print("m.groups()", m.groups())
    print("m.groupdict()", m.groupdict())
    if m.groupdict()['term']:
        # Production line: walk along the right-hand side, one item at a time.
        while m:
            s = s[m.end():]
            m = prodre.match(s)
            if m:
                print(m.groupdict())


handle("decl ← ⦃type⦄ declare ⦃space⦄ ⦃var⦄")

if __name__ == '__main__':
    with open('the.bnf') as f:
        pass
        # main(readlines_iter(f))
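A note on notation: main() parses a different grammar format from the.bnf file further down. In it, |= introduces production alternatives, ~~ attaches a token regex, and right-hand-side items are wrapped in <> (nonterminal reference), [] (literal token) or {} (regex). A minimal sketch of what an input in that notation might look like, and how it could be fed to main() from inside this file; the grammar below is invented for illustration and is not part of the gist.

# Sketch only: an invented grammar in the notation main() appears to parse.
grammar_lines = [
    'decl  |= <type> [declare] <var>',
    'print |= [print] <expr>',
    'expr  |= <var>',
    '      |= <expr> [+] <expr>',        # continuation line: reuses the last production
    'var   ~~ {[_a-zA-Z][_0-9a-zA-Z]*}',
]

main(grammar_lines)   # builds tok_patterns from the grammar, then lexes test.the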
lexer.py
from collections import namedtuple
import re


class LexError(Exception):
    def __init__(self, src, index, message=None):
        if message is None:
            line_num = src[:index].count('\n') + 1
            message = f'Failed to lex on line {line_num}'
        self.message = message
        super().__init__(self.message)
        self.src = src
        self.index = index

    def pretty_print(self):
        # Show the offending line with a caret run up to the error position.
        line_start = self.src[:self.index].rfind('\n') + 1
        line_end = self.src.find('\n', line_start)
        if line_end < 0:
            line_end = len(self.src)
        line = self.src[line_start:line_end]
        print(self.message)
        print(line)
        print('^' * (1 + self.index - line_start))


class TokenMatch(namedtuple('TokenMatch', 'token literal is_wild')):
    def __gt__(self, other):
        # Longest match wins; on a tie, prioritise non-wild tokens.
        if len(self.literal) > len(other.literal):
            return True
        if len(self.literal) < len(other.literal):
            return False
        return not self.is_wild and other.is_wild


def matching_tokens(token_dict, src):
    # Yield every token that matches at the start of src. Tokens mapped to a
    # regex are "wild"; tokens mapped to None match their own name literally.
    for token in token_dict:
        if token_dict[token] and (m := re.match(token_dict[token], src)):
            yield TokenMatch(token, m.group(), is_wild=True)
        elif src.startswith(token):
            yield TokenMatch(token, token, is_wild=False)


def tokenise(token_dict, src):
    i = 0
    while i < len(src):
        if src[i].isspace():
            i += 1
            continue
        best_match = max(matching_tokens(token_dict, src[i:]), default=None)
        if not best_match:
            raise LexError(src, i)
        i += len(best_match.literal)
        # Wild tokens yield their matched text; literal tokens yield False.
        yield best_match.token, best_match.is_wild and best_match.literal


class Lookahead:
    # Wraps an iterator so the next value is always visible as self.lookahead.

    def __init__(self, it):
        self.it = it
        self.is_empty = False
        self.lookahead = None
        try:
            self.lookahead = next(it)
        except StopIteration:
            self.is_empty = True

    def __iter__(self):
        while not self.is_empty:
            yield next(self)

    def __next__(self):
        if self.is_empty:
            raise StopIteration
        value = self.lookahead
        try:
            self.lookahead = next(self.it)
        except StopIteration:
            self.is_empty = True
        return value

    def __bool__(self):
        return not self.is_empty


def lex(token_dict, src):
    return Lookahead(tokenise(token_dict, src))
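A minimal usage sketch of lexer.py on its own, assuming a hand-built token dict; the token names and patterns below are invented for illustration. Tokens mapped to None are matched literally by their own name, tokens mapped to a compiled regex are "wild", and lex() returns a Lookahead over (token, literal) pairs.

import re
from lexer import lex

# Sketch only: these tokens are illustrative, not taken from the gist.
tokens = {
    'print': None,                                 # literal token: matched with startswith
    '+': None,                                     # literal token
    'var': re.compile(r'[_a-zA-Z][_0-9a-zA-Z]*'),  # wild token: matched with a regex
}

stream = lex(tokens, 'print foo + foo')
print(stream.lookahead)   # peek at the next (token, literal) pair without consuming it
for tok, lit in stream:   # lit is False for literal tokens, the matched text for wild ones
    print(tok, lit)
print(bool(stream))       # False once the stream is exhausted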
test.the
byte foo
print foo
print foo + foo
the.bnf
type  ← byte
      ← int
decl  ← ⦃type⦄ ⦃space⦄ ⦃var⦄
print ← print ⦃expr⦄
expr  ← ⦃var⦄
      ← ⦃expr⦄ + ⦃expr⦄
stmt  ← ⦃print⦄
      ← ⦃decl⦄
var   ≡ [_a-zA-Z][_0-9a-zA-Z]*
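How the.bnf seems to relate to the code above: ← lines look like productions, ≡ lines bind a token name to a regex, and ⦃name⦄ marks a nonterminal reference, which is what handle() tries to pull apart. A sketch of driving handle() over this file, assuming handle() and readlines_iter() from the parser file above are in scope (the gist does not show that file's name, so no import is written); the commented-out call in its __main__ block hints at this kind of loop.

# Sketch only: assumes handle() and readlines_iter() are in scope.
with open('the.bnf') as f:
    for line in readlines_iter(f):
        # Bare continuation lines such as '← int' do not start with a name,
        # so linere does not match them and handle() returns False.
        if handle(line) is False:
            print('skipped:', line)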