pfmiles · January 19, 2012 05:57 · pfmiles · Jan 19, 2012
diff --git a/lexer.py b/lexer.py
 import re

 class TokenType(object):
    """Integer codes identifying each kind of token; the lexer tags every
    token it yields with one of these values."""
    QUANTIFIER = 0
    ESCAPE_CHAR = 1
    UNICODE_CHAR = 2
    LEFT_PAREN = 3
    RIGHT_PAREN = 4
    OR = 5
    ALPHA = 6
    DIGIT = 7
    DOT = 8
    LEFT_BRACKET = 9
    NOT_IN = 10
    RIGHT_BRACKET = 11
    DASH = 12
    DERIVES = 13
    UNDER_SCORE = 14
    EOF = 15
    WHITE_SPACE = 16

 def lexer(code):
    """
    Tokenize ``code`` according to the lexer's own lexical rules.

    This is a generator, so the tokens can be traversed with a for..in
    statement. Each token is a tuple ``(token_type, row, col, text)``:
    ``token_type`` is the TokenType attribute named after the pattern group
    that matched, ``row`` and ``col`` are the zero-based position where the
    token starts, and ``text`` is the matched text. Once the input is used up
    a final ``(TokenType.EOF, row, col, "")`` token is yielded.

    Every alternative of the token pattern must be wrapped in a named group
    whose name is an attribute of TokenType.

    If no token can be recognised at the current position, a message with the
    one-based row and column is printed and SystemExit(1) is raised.
    """
    lexerPatterns = re.compile(
                            """
                            (?P<QUANTIFIER>[\\?\\+\\*])
                            |(?P<ESCAPE_CHAR>\\\\[btnfr\\\\\\\\\\.\\?\\+\\*\\(\\)\\|\\[\\^\\]\\-\\:\\=])
                            |(?P<UNICODE_CHAR>\\\\u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])
                            |......your regExps for tokens goes here...
                            """, re.U | re.S | re.X)
    cur = 0 # absolute offset in code of the next unread character
    curRow = 0
    curCol = 0
    while cur < len(code):
        # Match in place at offset cur: slicing code[cur:] would copy the
        # whole remaining input for every single token.
        rst = lexerPatterns.match(code, cur)
        # An empty match consumes nothing and would make this loop spin
        # forever, so it is reported exactly like a failed match.
        if rst is None or rst.end() == cur:
            print("Unexpected char encountered at row: " + repr(curRow + 1) + ", col: " + repr(curCol + 1) + ", incoming chars: '" + code[cur:cur + 10] + "...'")
            # Same exception the interactive-only exit() helper raises.
            raise SystemExit(1)
        text = rst.group()
        # lastgroup is the name of the alternative that matched.
        yield (getattr(TokenType, rst.lastgroup), curRow, curCol, text)
        cur = rst.end() # end() is already absolute because of the pos argument
        newlines = text.count('\n')
        if newlines:
            # The column restarts after the last newline inside the token.
            curRow += newlines
            curCol = len(text) - text.rfind('\n') - 1
        else:
            curCol += len(text)
    yield (TokenType.EOF, curRow, curCol, "")
	import re

	class TokenType(object):
	    """Integer codes identifying each kind of token; the lexer tags every
	    token it yields with one of these values."""
	    (QUANTIFIER, ESCAPE_CHAR, UNICODE_CHAR, LEFT_PAREN, RIGHT_PAREN, OR,
	     ALPHA, DIGIT, DOT, LEFT_BRACKET, NOT_IN, RIGHT_BRACKET, DASH, DERIVES,
	     UNDER_SCORE, EOF, WHITE_SPACE) = range(17)

	def lexer(code):
	    """
	    Tokenize ``code`` according to the lexer's own lexical rules.

	    This is a generator, so the tokens can be traversed with a for..in
	    statement. Each token is a tuple ``(token_type, row, col, text)``:
	    ``token_type`` is the TokenType attribute named after the pattern group
	    that matched, ``row`` and ``col`` are the zero-based position where the
	    token starts, and ``text`` is the matched text. Once the input is used
	    up a final ``(TokenType.EOF, row, col, "")`` token is yielded.

	    Every alternative of the token pattern must be wrapped in a named group
	    whose name is an attribute of TokenType.

	    If no token can be recognised at the current position, a message with
	    the one-based row and column is printed and SystemExit(1) is raised.
	    """
	    # The alternatives are joined with a plain "|"; an escaped pipe would be
	    # a literal pipe character and collapse them into one long sequence.
	    lexerPatterns = re.compile(
	        """
	        (?P<QUANTIFIER>[\\?\\+\\*])
	        |(?P<ESCAPE_CHAR>\\\\[btnfr\\\\\\\\\\.\\?\\+\\*\\(\\)\\|\\[\\^\\]\\-\\:\\=])
	        |(?P<UNICODE_CHAR>\\\\u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])
	        |......your regExps for tokens goes here...
	        """, re.U | re.S | re.X)
	    cur = 0 # absolute offset in code of the next unread character
	    curRow = 0
	    curCol = 0
	    while cur < len(code):
	        # Match in place at offset cur: slicing code[cur:] would copy the
	        # whole remaining input for every single token.
	        rst = lexerPatterns.match(code, cur)
	        # An empty match consumes nothing and would make this loop spin
	        # forever, so it is reported exactly like a failed match.
	        if rst is None or rst.end() == cur:
	            print("Unexpected char encountered at row: " + repr(curRow + 1) + ", col: " + repr(curCol + 1) + ", incoming chars: '" + code[cur:cur + 10] + "...'")
	            # Same exception the interactive-only exit() helper raises.
	            raise SystemExit(1)
	        text = rst.group()
	        # lastgroup is the name of the alternative that matched.
	        yield (getattr(TokenType, rst.lastgroup), curRow, curCol, text)
	        cur = rst.end() # end() is already absolute because of the pos argument
	        newlines = text.count('\n')
	        if newlines:
	            # The column restarts after the last newline inside the token.
	            curRow += newlines
	            curCol = len(text) - text.rfind('\n') - 1
	        else:
	            curCol += len(text)
	    yield (TokenType.EOF, curRow, curCol, "")