louisswarren · September 27, 2020 10:14
diff --git a/lexer.py b/lexer.py
 from collections import namedtuple
 import re

 # Literal patterns only match themselves, but quack like regular expressions

 class LiteralPattern(str):
    """A plain string exposing the ``match`` method of a compiled regex.

    The pattern matches only text that begins with the string itself.
    """

    def match(self, other):
        """Return a LiteralMatch if *other* starts with this literal, else None."""
        return LiteralMatch(str(self)) if other.startswith(self) else None

 class LiteralMatch(namedtuple('LiteralMatch', 'literal')):
    """One-field record holding matched literal text, with a ``group()`` accessor."""

    def group(self):
        """Return the matched literal text."""
        return self[0]

 class Lookahead:
    """Iterator wrapper that buffers one item so it can be peeked at.

    The next item is fetched eagerly.  If fetching raises (StopIteration at
    exhaustion, or any other Exception from the underlying iterator), the
    exception is stored and re-raised only when that item is requested via
    ``peek()``, ``next()`` or iteration.  The object is truthy while a
    buffered item is available.
    """

    def __init__(self, it):
        """Wrap the iterable *it* and pre-fetch its first item."""
        # iter() lets plain iterables (e.g. lists) be wrapped; previously
        # next() on a non-iterator raised a TypeError that was stored and
        # made the wrapper look exhausted.  For an iterator, iter() returns
        # the same object.
        self.it = iter(it)
        self.lookahead_exception = None
        self._set_lookahead()

    def __bool__(self):
        """Return True while a buffered item is available."""
        return self.lookahead_exception is None

    def _set_lookahead(self):
        """Fetch the next item, recording the exception if fetching fails."""
        try:
            self._lookahead = next(self.it)
        except Exception as e:
            self._lookahead = None
            self.lookahead_exception = e

    def peek(self):
        """Return the buffered item without consuming it.

        Raises the stored exception if no item is available.
        """
        if not self:
            raise self.lookahead_exception
        return self._lookahead

    def __iter__(self):
        """Yield the remaining items, consuming them.

        A stored StopIteration ends the iteration normally; any other stored
        exception is raised once the buffered items have been yielded.
        """
        while self:
            yield next(self)
        if not isinstance(self.lookahead_exception, StopIteration):
            raise self.lookahead_exception

    def __next__(self):
        """Return the buffered item and fetch the one after it.

        Raises the stored exception if no item is available.
        """
        if not self:
            raise self.lookahead_exception
        value = self._lookahead
        self._set_lookahead()
        return value

 class LexError(Exception):
    """Error raised for a position in the source that could not be lexed.

    Parameters:
        src: the complete source text being lexed.
        index: offset into *src* where the failure occurred.
        message: optional custom message; when omitted, a default message
            naming the 1-based line number of *index* is generated.
    """

    def __init__(self, src, index, message=None):
        if message is None:
            # Line number = newlines preceding the failing offset, plus one.
            line_num = src[:index].count("\n") + 1
            message = f'Failed to lex on line {line_num}'
        self.message = message
        super().__init__(self.message)
        self.src = src
        self.index = index

    def pretty_print(self):
        """Print the message, the offending source line, and a caret under the offset."""
        # Start of the line containing index (0 when no newline precedes it).
        start = self.src[:self.index].rfind('\n') + 1
        end = self.src.find('\n', start)
        # find() returns -1 when there is no later newline; 0 is a valid
        # position (source starting with a newline), so compare against -1.
        line = self.src[start:end] if end != -1 else self.src[start:]
        print(self.message)
        print(line)
        print(' ' * (self.index - start) + '^')

 class TokenMatch(namedtuple('TokenMatch', 'token literal')):
    """A token name paired with the text it matched.

    Only ``>`` is overridden: it ranks matches by the length of their
    literal, ignoring the token name.
    """

    def __gt__(self, other):
        """Return True if this literal is strictly longer than *other*'s."""
        own_length = len(self.literal)
        other_length = len(other.literal)
        return own_length > other_length

 def matching_tokens(token_list, src):
    """Yield a TokenMatch for each token whose pattern matches the start of *src*.

    *token_list* is an iterable of ``(token_name, pattern)`` pairs.  A
    pattern may be any object with a ``match(text)`` method (a compiled
    regex, a LiteralPattern) or a plain regex string.
    """
    for token_name, token_re in token_list:
        # Use the pattern's own match() when it has one.  Passing a
        # LiteralPattern (a str subclass) to re.match() would compile it as
        # a regex, so literals such as '+' or '.' would not match themselves.
        matcher = getattr(token_re, 'match', None)
        m = matcher(src) if matcher is not None else re.match(token_re, src)
        if m:
            yield TokenMatch(token_name, m.group())

 def tokenise(token_list, src):
    """Lazily split *src* into TokenMatch objects.

    Whitespace between tokens is skipped.  At each position the match with
    the longest literal is chosen; on a tie, the token listed first in
    *token_list* wins.

    Raises:
        LexError: when no token matches at the current position, or when
            the best match is empty.
    """
    i = 0
    while i < len(src):
        if src[i].isspace():
            i += 1
            continue
        best_match = max(matching_tokens(token_list, src[i:]), default=None)
        # An empty literal would never advance i, producing an endless
        # stream of empty tokens, so it is treated as a failure to lex.
        if best_match is None or not best_match.literal:
            raise LexError(src, i)
        yield best_match
        i += len(best_match.literal)

 def lex(token_list, src):
    """Return a Lookahead wrapping the token stream that tokenise() yields for *src*."""
    token_stream = tokenise(token_list, src)
    return Lookahead(token_stream)

 if __name__ == '__main__':
    # Demo: lex a short string with one literal token and one regex token,
    # printing each token name with the repr of the text it matched.
    demo_tokens = [
        ('HELLO', LiteralPattern('hello')),
        ('NUM', re.compile('[0-9]+')),
    ]
    try:
        for tok, lit in lex(demo_tokens, "hello 1241"):
            print(tok, repr(lit))
    except LexError as err:
        err.pretty_print()
	from collections import namedtuple
	import re

	# Literal patterns only match themselves, but quack like regular expressions

	class LiteralPattern(str):
	    """A plain string exposing the ``match`` method of a compiled regex.

	    The pattern matches only text that begins with the string itself.
	    """

	    def match(self, other):
	        """Return a LiteralMatch if *other* starts with this literal, else None."""
	        return LiteralMatch(str(self)) if other.startswith(self) else None

	class LiteralMatch(namedtuple('LiteralMatch', 'literal')):
	    """One-field record holding matched literal text, with a ``group()`` accessor."""

	    def group(self):
	        """Return the matched literal text."""
	        return self.literal

	class Lookahead:
	    """Iterator wrapper that buffers one item so it can be peeked at.

	    The next item is fetched eagerly.  If fetching raises (StopIteration at
	    exhaustion, or any other Exception from the underlying iterator), the
	    exception is stored and re-raised only when that item is requested via
	    ``peek()``, ``next()`` or iteration.  The object is truthy while a
	    buffered item is available.
	    """

	    def __init__(self, it):
	        """Wrap the iterable *it* and pre-fetch its first item."""
	        # iter() lets plain iterables (e.g. lists) be wrapped; next() on a
	        # non-iterator would raise a TypeError that gets stored and makes
	        # the wrapper look exhausted.  For an iterator, iter() returns the
	        # same object.
	        self.it = iter(it)
	        self.lookahead_exception = None
	        self._set_lookahead()

	    def __bool__(self):
	        """Return True while a buffered item is available."""
	        return self.lookahead_exception is None

	    def _set_lookahead(self):
	        """Fetch the next item, recording the exception if fetching fails."""
	        try:
	            self._lookahead = next(self.it)
	        except Exception as e:
	            self._lookahead = None
	            self.lookahead_exception = e

	    def peek(self):
	        """Return the buffered item without consuming it.

	        Raises the stored exception if no item is available.
	        """
	        if not self:
	            raise self.lookahead_exception
	        return self._lookahead

	    def __iter__(self):
	        """Yield the remaining items, consuming them.

	        A stored StopIteration ends the iteration normally; any other
	        stored exception is raised once the buffered items are yielded.
	        """
	        while self:
	            yield next(self)
	        try:
	            raise self.lookahead_exception
	        except StopIteration:
	            return

	    def __next__(self):
	        """Return the buffered item and fetch the one after it.

	        Raises the stored exception if no item is available.
	        """
	        if not self:
	            raise self.lookahead_exception
	        value = self._lookahead
	        self._set_lookahead()
	        return value

	class LexError(Exception):
	    """Error raised for a position in the source that could not be lexed.

	    Parameters:
	        src: the complete source text being lexed.
	        index: offset into *src* where the failure occurred.
	        message: optional custom message; when omitted, a default message
	            naming the 1-based line number of *index* is generated.
	    """

	    def __init__(self, src, index, message=None):
	        if message is None:
	            # Line number = newlines preceding the failing offset, plus one.
	            line_num = src[:index].count("\n") + 1
	            message = f'Failed to lex on line {line_num}'
	        self.message = message
	        super().__init__(self.message)
	        self.src = src
	        self.index = index

	    def pretty_print(self):
	        """Print the message, the offending source line, and a caret under the offset."""
	        # Start of the line containing index (0 when no newline precedes it).
	        start = self.src[:self.index].rfind('\n') + 1
	        end = self.src.find('\n', start)
	        # find() returns -1 when there is no later newline; 0 is a valid
	        # position (source starting with a newline), so compare against -1.
	        line = self.src[start:end] if end != -1 else self.src[start:]
	        print(self.message)
	        print(line)
	        print(' ' * (self.index - start) + '^')

	class TokenMatch(namedtuple('TokenMatch', 'token literal')):
	    """A token name paired with the text it matched.

	    Only ``>`` is overridden: it ranks matches by the length of their
	    literal, ignoring the token name.
	    """

	    def __gt__(self, other):
	        """Return True if this literal is strictly longer than *other*'s."""
	        return len(self.literal) > len(other.literal)

	def matching_tokens(token_list, src):
	    """Yield a TokenMatch for each token whose pattern matches the start of *src*.

	    *token_list* is an iterable of ``(token_name, pattern)`` pairs.  A
	    pattern may be any object with a ``match(text)`` method (a compiled
	    regex, a LiteralPattern) or a plain regex string.
	    """
	    for token_name, token_re in token_list:
	        # Use the pattern's own match() when it has one.  Passing a
	        # LiteralPattern (a str subclass) to re.match() would compile it
	        # as a regex, so literals such as '+' or '.' would not match
	        # themselves.
	        matcher = getattr(token_re, 'match', None)
	        m = matcher(src) if matcher is not None else re.match(token_re, src)
	        if m:
	            yield TokenMatch(token_name, m.group())

	def tokenise(token_list, src):
	    """Lazily split *src* into TokenMatch objects.

	    Whitespace between tokens is skipped.  At each position the match
	    with the longest literal is chosen; on a tie, the token listed first
	    in *token_list* wins.

	    Raises:
	        LexError: when no token matches at the current position, or when
	            the best match is empty.
	    """
	    i = 0
	    while i < len(src):
	        if src[i].isspace():
	            i += 1
	            continue
	        best_match = max(matching_tokens(token_list, src[i:]), default=None)
	        # An empty literal would never advance i, producing an endless
	        # stream of empty tokens, so it is treated as a failure to lex.
	        if best_match is None or not best_match.literal:
	            raise LexError(src, i)
	        yield best_match
	        i += len(best_match.literal)

	def lex(token_list, src):
	    """Return a Lookahead wrapping the token stream that tokenise() yields for *src*."""
	    return Lookahead(tokenise(token_list, src))

	if __name__ == '__main__':
	    # Demo: lex a short string with one literal token and one regex token,
	    # printing each token name with the repr of the text it matched.
	    tl = [('HELLO', LiteralPattern('hello')),
	          ('NUM', re.compile('[0-9]+'))]
	    try:
	        lx = lex(tl, "hello 1241")
	        for tok, lit in lx:
	            print(tok, repr(lit))
	    except LexError as err:
	        err.pretty_print()