Skip to content

Instantly share code, notes, and snippets.

@pfmiles
Created January 19, 2012 05:57
Show Gist options
  • Save pfmiles/1638250 to your computer and use it in GitHub Desktop.
Save pfmiles/1638250 to your computer and use it in GitHub Desktop.
A code skeleton for an iterable (generator-based) lexer in Python, built on regular expressions.
import re
class TokenType(object):
    """
    Integer codes for the token kinds, numbered consecutively from 0 to 16.

    The attribute names double as regexp group names: lexer() looks a token's
    code up with getattr(TokenType, <matched group name>).
    """
    QUANTIFIER = 0
    ESCAPE_CHAR = 1
    UNICODE_CHAR = 2
    LEFT_PAREN = 3
    RIGHT_PAREN = 4
    OR = 5
    ALPHA = 6
    DIGIT = 7
    DOT = 8
    LEFT_BRACKET = 9
    NOT_IN = 10
    RIGHT_BRACKET = 11
    DASH = 12
    DERIVES = 13
    UNDER_SCORE = 14
    EOF = 15
    WHITE_SPACE = 16
def lexer(code):
    """
    Lazily tokenize ``code`` according to the lexical rules compiled below.

    This lexer is iterable: traverse it with a ``for .. in`` statement until
    the EOF token is reached.

    Yields:
        Tuples ``(token_type, row, col, text)``. ``token_type`` is the
        ``TokenType`` attribute named after the regexp group that matched,
        ``row`` and ``col`` are the zero-based position where the token
        starts, and ``text`` is the matched text. After the last token one
        final ``(TokenType.EOF, row, col, "")`` tuple is yielded.

    Raises:
        SystemExit: with exit code 1, after printing a diagnostic with the
            one-based row and column, when no token pattern matches at the
            current position (or a pattern matches only the empty string).
    """
    # One top-level named group per token kind; every group name must also be
    # an attribute of TokenType. re.X makes the layout whitespace insignificant.
    lexerPatterns = re.compile(
        """
        (?P<QUANTIFIER>[\\?\\+\\*])
        |(?P<ESCAPE_CHAR>\\\\[btnfr\\\\\\\\\\.\\?\\+\\*\\(\\)\\|\\[\\^\\]\\-\\:\\=])
        |(?P<UNICODE_CHAR>\\\\u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])
        |......your regExps for tokens goes here...
        """, re.U | re.S | re.X)
    cur = 0  # absolute offset in code of the next unread character
    curRow = 0
    curCol = 0
    codeLen = len(code)
    while cur < codeLen:
        # Match in place from offset cur rather than slicing code[cur:], which
        # would copy the remaining input for every token (quadratic overall).
        # NOTE: anchors/look-behinds in added patterns now see the whole input.
        rst = lexerPatterns.match(code, cur)
        # A zero-width match would never advance cur and loop forever, so it
        # is reported the same way as no match at all.
        if rst is None or rst.end() == cur:
            print("Unexpected char encountered at row: %d, col: %d, incoming chars: '%s...'"
                  % (curRow + 1, curCol + 1, code[cur:cur + 10]))
            raise SystemExit(1)
        text = rst.group()
        # lastgroup is the name of the top-level alternative that matched.
        yield (getattr(TokenType, rst.lastgroup), curRow, curCol, text)
        cur = rst.end()
        # Advance the row/column counters past the consumed text.
        newlines = text.count('\n')
        if newlines:
            curRow += newlines
            curCol = len(text) - text.rfind('\n') - 1
        else:
            curCol += len(text)
    yield (TokenType.EOF, curRow, curCol, "")
@pfmiles
Copy link
Author

pfmiles commented Jan 19, 2012

usage:
tokens = [t for t in lexer(code)]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment