Created June 17, 2013 14:36
Generic regex-based lexer in Python

#-------------------------------------------------------------------------------
# lexer.py
#
# A generic regex-based Lexer/tokenizer tool.
# See the __main__ section at the bottom for an example.
#
# Eli Bendersky ([email protected])
# This code is in the public domain
# Last modified: August 2010
#-------------------------------------------------------------------------------
import re
import sys

class Token(object):
    """ A simple Token structure.
        Contains the token type, value and position.
    """
    def __init__(self, type, val, pos):
        self.type = type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (self.type, self.val, self.pos)

class LexerError(Exception):
    """ Lexer error exception.

        pos:
            Position in the input line where the error occurred.
    """
    def __init__(self, pos):
        self.pos = pos
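
# Note (illustration): LexerError.pos indexes into the buffer handed to
# Lexer.input() below, so buf[err.pos] is the character at which no rule matched.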

class Lexer(object):
    """ A simple regex-based lexer/tokenizer.

        See below for an example of usage.
    """
    def __init__(self, rules, skip_whitespace=True):
        r""" Create a lexer.

            rules:
                A list of rules. Each rule is a `regex, type`
                pair, where `regex` is the regular expression used
                to recognize the token and `type` is the type
                of the token to return when it's recognized.

            skip_whitespace:
                If True, whitespace (\s+) will be skipped and not
                reported by the lexer. Otherwise, you have to
                specify your rules for whitespace, or it will be
                flagged as an error.
        """
        # All the regexes are concatenated into a single one
        # with named groups. Since the group names must be valid
        # Python identifiers, but the token types used by the
        # user are arbitrary strings, we auto-generate the group
        # names and map them to token types.
        #
        idx = 1
        regex_parts = []
        self.group_type = {}

        for regex, type in rules:
            groupname = 'GROUP%s' % idx
            regex_parts.append('(?P<%s>%s)' % (groupname, regex))
            self.group_type[groupname] = type
            idx += 1

        self.regex = re.compile('|'.join(regex_parts))
        self.skip_whitespace = skip_whitespace
        self.re_ws_skip = re.compile(r'\S')
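        # For illustration: given rules = [(r'\d+', 'NUMBER'), (r'\+', 'PLUS')],
        # the combined pattern is r'(?P<GROUP1>\d+)|(?P<GROUP2>\+)' and
        # self.group_type maps {'GROUP1': 'NUMBER', 'GROUP2': 'PLUS'}.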

    def input(self, buf):
        """ Initialize the lexer with a buffer as input.
        """
        self.buf = buf
        self.pos = 0

    def token(self):
        """ Return the next token (a Token object) found in the
            input buffer. None is returned if the end of the
            buffer was reached.

            In case of a lexing error (the current chunk of the
            buffer matches no rule), a LexerError is raised with
            the position of the error.
        """
        if self.pos >= len(self.buf):
            return None
        else:
            if self.skip_whitespace:
                m = self.re_ws_skip.search(self.buf, self.pos)
                if m:
                    self.pos = m.start()
                else:
                    return None

            m = self.regex.match(self.buf, self.pos)
            if m:
                groupname = m.lastgroup
                tok_type = self.group_type[groupname]
                tok = Token(tok_type, m.group(groupname), self.pos)
                self.pos = m.end()
                return tok

            # if we're here, no rule matched
            raise LexerError(self.pos)

    def tokens(self):
        """ Returns an iterator to the tokens found in the buffer.
        """
        while True:
            tok = self.token()
            if tok is None:
                break
            yield tok
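
    # Usage note: tokens() is a generator, so it can drive a for-loop directly
    # or be materialized with list(lx.tokens()).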

if __name__ == '__main__':
    rules = [
        (r'\d+',          'NUMBER'),
        (r'[a-zA-Z_]\w*', 'IDENTIFIER'),
        (r'\+',           'PLUS'),
        (r'\-',           'MINUS'),
        (r'\*',           'MULTIPLY'),
        (r'\/',           'DIVIDE'),
        (r'\(',           'LP'),
        (r'\)',           'RP'),
        (r'=',            'EQUALS'),
    ]

    lx = Lexer(rules, skip_whitespace=True)
    lx.input('erw = _abc + 12*(R4-623902) ')

    try:
        for tok in lx.tokens():
            print(tok)
    except LexerError as err:
        print('LexerError at position %s' % err.pos)
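
Running the example should print one line per token, along these lines (token
type, matched text, and starting position in the input string):

    IDENTIFIER(erw) at 0
    EQUALS(=) at 4
    IDENTIFIER(_abc) at 6
    PLUS(+) at 11
    NUMBER(12) at 13
    MULTIPLY(*) at 15
    LP(() at 16
    IDENTIFIER(R4) at 17
    MINUS(-) at 19
    NUMBER(623902) at 20
    RP()) at 26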