Last active
September 7, 2016 11:30
-
-
Save milesrout/b73c235f03793dc208fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import re | |
class Lexer:
    """Regex-driven tokenizer built from a table of ``(name, pattern)`` pairs.

    Every pair is turned into a named group and all groups are OR-ed into a
    single compiled pattern. Regex alternation tries alternatives left to
    right, so earlier pairs win over later ones at the same position. Text
    that no pattern matches is skipped without error.
    """

    def __init__(self, tokens):
        """Store *tokens* and compile the combined pattern.

        Args:
            tokens: Iterable of ``(name, pattern)`` pairs. ``name`` is
                upper-cased and used as the regex group name; ``pattern``
                is inserted verbatim as a regular-expression fragment.

        Raises:
            re.error: If the combined pattern is not a valid regex.
        """
        self.tokens = tokens
        self.regex = re.compile(self.combine_regexes(self.make_named_groups()))

    def combine_regexes(self, named_groups):
        """Return the alternation (``a|b|c``) of the given regex fragments."""
        return '|'.join(named_groups)

    def make_named_group(self, key, value):
        """Return *value* wrapped in a named group called ``key.upper()``."""
        return "(?P<{0}>{1})".format(key.upper(), value)

    def make_named_groups(self):
        """Yield one named-group fragment per ``(name, pattern)`` pair."""
        for key, value in self.tokens:
            yield self.make_named_group(key, value)

    def tokens_from_file(self, filename):
        """Yield one lazy token iterator per line of the file *filename*.

        Lines are read as this generator advances instead of being loaded
        all at once; the file stays open until the generator is exhausted
        or closed.
        """
        with open(filename) as f:
            for line in f:
                yield self.tokens_from_string(line)

    def tokens_from_string(self, string):
        """Yield the text of each token found in *string*, left to right."""
        # `first` is the module-level helper that picks the first non-None
        # entry out of each match's groups tuple.
        yield from map(first, self.match_groups(string))

    def match_groups(self, string):
        """Yield the ``groups()`` tuple of every match in *string*."""
        for match in self.matches(string):
            yield match.groups()

    def matches(self, string):
        """Yield successive non-overlapping match objects found in *string*."""
        # The pattern is already compiled, so use its own method directly.
        yield from self.regex.finditer(string)
def first(it):
    """Return the first element of *it* that is not ``None``.

    Args:
        it: Any iterable.

    Returns:
        The first element for which ``x is not None`` holds; falsy values
        such as ``0`` or ``''`` count as valid results.

    Raises:
        ValueError: If *it* is empty or contains only ``None``. A
            ``StopIteration`` is deliberately not allowed to escape: inside
            ``map()`` / ``yield from`` pipelines it would be mistaken for
            normal exhaustion and silently cut the output short.
    """
    for item in it:
        if item is not None:
            return item
    raise ValueError("first() found no non-None element")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage example: lex a file line by line and print each line's tokens.
# Token table of (name, pattern) pairs; earlier entries take priority when
# several patterns could match at the same position.
tokens = [
    ('if', 'if'),
    ('ident', '[a-zA-Z_]+'),
    # ... add further (name, pattern) pairs here ...
]
l = Lexer(tokens)
# NOTE: `filename` is not defined in this snippet; the caller must set it.
with open(filename, 'r') as f:
    # Iterate the file directly so lines are streamed rather than loaded
    # into memory all at once.
    for line in f:
        print(' '.join(l.tokens_from_string(line)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment