Last active
September 27, 2020 10:14
-
-
Save louisswarren/2c9e3b20b66b5ce1c23b2c73cd42a114 to your computer and use it in GitHub Desktop.
BNF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import namedtuple | |
import re | |
# A literal pattern matches only its own exact text, but exposes a regex-like match() so it can be used where a compiled regular expression is expected.
class LiteralPattern(str):
    """A string offering a regex-like ``match`` for its own literal text."""

    def match(self, other):
        """Return a LiteralMatch if *other* starts with this text, else None."""
        if not other.startswith(self):
            return None
        return LiteralMatch(str(self))
class LiteralMatch(namedtuple('LiteralMatch', 'literal')):
    """Minimal stand-in for a regex match object holding the matched text."""

    def group(self):
        """Return the matched literal, like a no-argument ``Match.group()``."""
        matched, = self
        return matched
class Lookahead:
    """Wrap an iterator, buffering one item so it can be inspected early.

    Any exception raised while fetching the buffered item (including
    ``StopIteration``) is stored instead of propagating; it is raised
    later, when that item would be peeked at or consumed.
    """

    def __init__(self, it):
        self.it = it
        self.lookahead_exception = None
        # Fetch the first item eagerly so peek() works straight away.
        self._set_lookahead()

    def __bool__(self):
        """Return True while a buffered item is available."""
        return self.lookahead_exception is None

    def _set_lookahead(self):
        """Pull the next item into the buffer, or record why that failed."""
        try:
            upcoming = next(self.it)
        except Exception as exc:
            self._lookahead, self.lookahead_exception = None, exc
        else:
            self._lookahead = upcoming

    def peek(self):
        """Return the buffered item without consuming it.

        Raises the stored exception when no item is available.
        """
        if self:
            return self._lookahead
        raise self.lookahead_exception

    def __iter__(self):
        """Yield the remaining items, then surface any stored error.

        A stored ``StopIteration`` simply ends the iteration; any other
        stored exception is raised once the items are used up.
        """
        while self:
            yield next(self)
        pending = self.lookahead_exception
        if not isinstance(pending, StopIteration):
            raise pending

    def __next__(self):
        """Consume and return the buffered item, refilling the buffer.

        Raises the stored exception when no item is available.
        """
        if self:
            current = self._lookahead
            self._set_lookahead()
            return current
        raise self.lookahead_exception
class LexError(Exception):
    """Error describing a position in a source text where lexing failed.

    Attributes set by the constructor:
        src: the full source text.
        index: offset into ``src`` where the failure occurred.
        message: the supplied message, or a default naming the 1-based
            line number of ``index``.
    """

    def __init__(self, src, index, message=None):
        if message is None:
            # Line number = newlines before the failing offset, plus one.
            line_num = src.count("\n", 0, index) + 1
            message = f'Failed to lex on line {line_num}'
        self.message = message
        super().__init__(self.message)
        self.src = src
        self.index = index

    def pretty_print(self):
        """Print the message, the offending line, and a caret under the index."""
        # Start of the line containing ``index`` (0 when there is no
        # earlier newline, because rfind() returns -1).
        start = self.src.rfind('\n', 0, self.index) + 1
        end = self.src.find('\n', start)
        # find() signals "no newline" with -1; 0 is a valid position (an
        # empty first line), so compare against -1 rather than testing > 0.
        line = self.src[start:end] if end != -1 else self.src[start:]
        print(self.message)
        print(line)
        print(' ' * (self.index - start) + '^')
class TokenMatch(namedtuple('TokenMatch', 'token literal')):
    """A token name paired with the text it matched.

    ``>`` is overridden to compare only the length of the matched text.
    """

    def __gt__(self, other):
        """Return True when this match's text is longer than *other*'s."""
        return len(other.literal) < len(self.literal)
def matching_tokens(token_list, src):
    """Yield a TokenMatch for each pattern that matches at the start of *src*.

    Args:
        token_list: iterable of ``(token_name, pattern)`` pairs. A pattern
            is either an object with a ``match(text)`` method (a compiled
            regex, a LiteralPattern) or a plain regex string.
        src: the text to match against, starting at its first character.

    Yields:
        TokenMatch(token_name, matched_text) for every pattern that matches.
    """
    for token_name, token_re in token_list:
        # Use the pattern's own match() when it has one. Passing a
        # LiteralPattern (a str subclass) to re.match() would recompile it
        # as a regular expression, so metacharacters such as '.' or '+'
        # would stop being literal. Plain strings have no match() and are
        # still treated as regex source.
        matcher = getattr(token_re, 'match', None)
        if (m := matcher(src) if matcher is not None else re.match(token_re, src)):
            yield TokenMatch(token_name, m.group())
def tokenise(token_list, src):
    """Lazily split *src* into TokenMatch items, skipping whitespace.

    At each non-whitespace position every pattern in *token_list* is tried
    against the remaining text and the largest candidate (per ``max``) is
    yielded.

    Args:
        token_list: sequence of ``(token_name, pattern)`` pairs; it is
            re-read at every position, so it must be re-iterable.
        src: the source text to lex.

    Yields:
        The chosen TokenMatch for each token, in source order.

    Raises:
        LexError: if no pattern matches at a non-whitespace position, or
            the best match there is empty.
    """
    i = 0
    end = len(src)
    while i < end:
        if src[i].isspace():
            i += 1
            continue
        best_match = max(matching_tokens(token_list, src[i:]), default=None)
        # A zero-length best match would never advance ``i``, so this
        # generator would yield empty tokens forever; treat it as a failure.
        if best_match is None or not best_match.literal:
            raise LexError(src, i)
        yield best_match
        i += len(best_match.literal)
def lex(token_list, src):
    """Return a Lookahead over the tokens that tokenise() produces for *src*."""
    token_stream = tokenise(token_list, src)
    return Lookahead(token_stream)
if __name__ == '__main__':
    # Demo: lex a short string with one literal token and one regex token,
    # printing each token name with the repr of its matched text.
    demo_tokens = [
        ('HELLO', LiteralPattern('hello')),
        ('NUM', re.compile('[0-9]+')),
    ]
    try:
        for name, text in lex(demo_tokens, "hello 1241"):
            print(name, repr(text))
    except LexError as err:
        err.pretty_print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment