- Verify that IndentTokenizer works with a non-LINE-based tokenizer specification, rather than only the default (a rough sketch of such a call follows this list)
- Resurrect the 'NEWLINE' token, as we need it to be as picky about the file format as the current parser is
- Implement a parser which leverages this to properly change the lexer states for both ordinary functions and "def" syntax functions. Decide whether it should be custom, PLY, codetalker, or something else, by first determining whether those libraries can support a lexer like ours
- Do performance testing comparing the new parser against the old one, and against the pyparsing implementation
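Not from the gist: a rough, untested sketch of what the first item above might look like, driving IndentTokenizer (defined in the first file below) with a word-level specification instead of the default LINE-based one. The module name indenttokenizer is an assumed filename; the gist does not name its files.

# Hedged sketch for the first TODO item (not part of the gist).
# 'indenttokenizer' is an assumed module name for the IndentTokenizer file below.
from indenttokenizer import IndentTokenizer

word_spec = [
    ('NEWLINE', r'\n'),
    ('SKIP', r'[ \t]+'),
    ('WORD', r'\S+'),
]

source = (
    "task:\n"
    "  echo one\n"
    "  echo two\n"
    "done\n"
)

for token in IndentTokenizer(source, word_spec):
    print(token)
# Expected types and values (line/column omitted): WORD 'task:', INDENT '  ',
# WORD 'echo', WORD 'one', WORD 'echo', WORD 'two', DEDENT '  ', WORD 'done'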
Created May 10, 2011 00:38
Experiments with splitting the lexer out of the parser for BitBake's file format
import lexer
import re


class IndentTokenizer(lexer.Tokenizer):
    """Tokenizer which tracks indentation, for parsing python-like strings"""

    _white = r'(\s+)(?=\S)'
    _line = r'.*(?=\n)'
    _spec = [
        ('NEWLINE', r'\n'),
        ('LINE', _line),
    ]

    def __init__(self, string, tok_spec=None):
        self.current_indent = 0
        self.indents = []
        self.pending = []
        self.after_newline = True
        if tok_spec is None:
            tok_spec = self._spec
        lexer.Tokenizer.__init__(self, string, tok_spec)

    def next_token(self):
        if self.pending:
            return self.pending.pop(0)
        token = lexer.Tokenizer.next_token(self)
        if not token and self.indents:
            # flush any remaining indent levels at end of input
            return lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
        return token

    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.after_newline = True
        else:
            if self.after_newline:
                # first match on a new line: emit INDENT/DEDENT before the token itself
                self.after_newline = False
                obj = self.handle_indents(old_position)
                if obj:
                    self.position = old_position
                    return obj
        return lexer.Tokenizer.process_match(self, match, old_position)

    def handle_indents(self, position):
        indents = list(self.process_indent(position))
        if indents:
            self.pending.extend(indents[1:])
            return indents[0]

    def process_indent(self, position):
        line = re.compile(self._line).match(self.string, position)
        if line:
            line_value = line.group()
            if line_value.strip():
                matched = re.match(self._white, line_value)
                if matched:
                    indent = matched.group(1)
                    if self.indents and indent == self.indents[-1]:
                        pass
                    elif len(self.indents) > 1 and indent == self.indents[-2]:
                        # dedent
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
                    else:
                        self.indents.append(indent)
                        yield lexer.Token('INDENT', indent, self.line - 1, 0)
                elif self.indents:
                    while self.indents:
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)


def parse_function(string, position):
    """Collect the lines of an indented block, stopping at the matching dedent."""
    body = []
    indent = 0
    tokenizer = IndentTokenizer(string)
    tokenizer.position = position
    for token in tokenizer:
        if token.typ == 'INDENT':
            indent += 1
        elif token.typ == 'DEDENT':
            indent -= 1
            if not indent:
                break
        else:
            body.append(token.value)
    return [line + '\n' for line in body], tokenizer.position


if __name__ == '__main__':
    teststring = """
def foo(value):
    print(value * 5)
    # foo
    def anotherfunc(anothervalue):
        print('hi, mom!')
        return anothervalue + 6
    return anotherfunc(12)
print("hi, mom!")
"""
    body, position = parse_function(teststring, 0)
    print(''.join(body))
    assert teststring[position:] == 'print("hi, mom!")\n'
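For reference, here is a small sketch (not part of the gist) of the raw token stream the default LINE-based spec yields before parse_function consumes it. It assumes the file above is saved as indenttokenizer.py and the lexer module below as lexer.py; both filenames are assumptions.

# Hedged sketch: the raw INDENT/DEDENT/LINE stream behind parse_function().
# 'indenttokenizer' is an assumed module name for the file above.
from indenttokenizer import IndentTokenizer

source = (
    "shellfunc () {\n"
    "    echo foo\n"
    "}\n"
)

for token in IndentTokenizer(source):
    print(token)
# Expected types and values (line/column omitted):
#   LINE 'shellfunc () {'
#   INDENT '    '
#   LINE '    echo foo'
#   DEDENT '    '
#   LINE '}'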
import collections
import re
import sys


Token = collections.namedtuple('Token', 'typ value line column')


class Tokenizer(object):
    """Simple regular expression based tokenizer with support for lexing states.

    Based on http://docs.python.org/dev/library/re.html#writing-a-tokenizer.
    """

    def __init__(self, string, tok_spec=None, keywords=None):
        self.states = []
        self.string = string
        self.line = 1
        self.position = self.line_start = 0
        self.newline_tokens = False
        self.length = len(string)
        if keywords is None:
            keywords = []
        self.keywords = keywords
        if tok_spec is not None:
            self.push_state(tok_spec)

    def push_state(self, tok_spec):
        # spec entries whose names start with '_' are helpers, not token groups
        spec = [elem for elem in tok_spec if not elem[0].startswith('_')]
        match = self.spec_match(spec)
        self.states.append((tok_spec, match))

    def pop_state(self):
        self.states.pop()

    def spec_match(self, tok_spec):
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        match = re.compile(tok_re).match
        return lambda: match(self.string, self.position)

    def next(self):
        token_obj = self.next_token()
        if token_obj:
            return token_obj

        if self.position != self.length:
            raise RuntimeError('Unexpected character %r on line %d' %
                               (self.string[self.position], self.line))
        else:
            raise StopIteration()

    __next__ = next  # Python 3 compatibility for the iterator protocol

    def next_token(self):
        specification, match = self.states[-1]
        for matched in iter(match, None):
            old_position = self.position
            self.position = matched.end()
            token = self.process_match(matched, old_position)
            if token:
                if token.typ == 'IDENTIFIER' and token.value in self.keywords:
                    return token._replace(typ='KEYWORD')
                return token

    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.line_start = old_position
            self.line += 1
        elif typ != 'SKIP':
            return self.generate_token(match)

    def generate_token(self, match):
        typ = match.lastgroup
        value = match.group(typ)
        return Token(typ, value, self.line, match.start() - self.line_start)

    def __iter__(self):
        return self


def get_tokens(string):
    specification = [
        ('OPERATOR', r'(=[+.]|[+.:?]=|=)'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'{'),
        ('RBRACE', r'}'),
        ('COLON', r':'),
        # literal '+', '-', '_', '.' allowed in identifiers ('-' escaped so it is
        # not treated as a character range)
        ('IDENTIFIER', r'[a-zA-Z0-9+\-_.${}/]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
    ]
    line_spec = [
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('LINE', r'.*(?=\n)'),
    ]
    tokenizer = Tokenizer(string, specification,
                          keywords=['inherit', 'include', 'require', 'addtask', 'export',
                                    'before', 'after', 'python', 'EXPORT_FUNCTIONS'])
    for token in tokenizer:
        if token.typ == 'OPERATOR':
            # switch to line-oriented lexing for the assignment's right-hand side
            yield token
            tokenizer.push_state(line_spec)
        elif token.typ == 'LINE':
            tokenizer.pop_state()
            yield token._replace(typ='VALUE')
        else:
            yield token


def main():
    teststring = """
inherit foo
include bar
require foo/bar.conf
FOO = "bar"
BAR := "foo bar"
ALPHA += "beta"
BETA .= 'theta'
OMEGA =. omega
# something commented
TEST =+ "meh"
python () {
    alpha
    beta
    theta
}
shellfunc () {
    echo foo
}
EXPORT_FUNCTIONS myfunc anotherfunc
EXPORT_FUNCTIONS myfunc
addtask some_task before this after that
addtask some_task before this
addtask some_task after that
addtask some_task
def get_something_or_other(d):
    def something_else():
        return 5
    return something_else() * 3
"""
    for token in get_tokens(teststring):
        print(token)


if __name__ == '__main__':
    result = main()
    if not result:
        sys.exit(1)
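A small usage sketch, not part of the gist, assuming the file above is saved as lexer.py (the gist does not name its files). It exercises the OPERATOR-driven state switch in get_tokens: after the operator, the line spec is pushed, so the right-hand side of the assignment comes back as a single VALUE token.

# Hedged usage sketch (not part of the gist); 'lexer' is an assumed filename.
import lexer

for token in lexer.get_tokens('FOO = "bar baz"\ninherit foo\n'):
    print(token)
# Expected token types and values (line/column omitted):
#   IDENTIFIER 'FOO', OPERATOR '=', VALUE '"bar baz"',
#   KEYWORD 'inherit', IDENTIFIER 'foo'

As written, main() above should stop with a RuntimeError once it reaches the '*' inside the def body, since the flat specification has no rule for that character; switching to an indentation-aware state for those bodies is what the TODO list at the top describes.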
On Tue, May 10, 2011 at 1:07 AM, esben ***@***.*** wrote:
I have written a BitBake parser with PLY (ply.lex + ply.yacc); it has achieved performance comparable to the current BitBake parser, and I am not done optimizing yet.
I would very much like to share the work done with you.
Nice, that sounds very promising. I'd love to check it out. Well done :)
Christopher Larson
clarson at kergoth dot com
Founder - BitBake, OpenEmbedded, OpenZaurus
Maintainer - Tslib
Senior Software Engineer, Mentor Graphics
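For reference on the PLY question in the TODO list: PLY's lexer does support state switching much like push_state/pop_state above. The following is a hedged, illustrative sketch only, not esben's parser; all token names, rules, and the driver are assumptions.

# Illustrative sketch of PLY lexer states (not esben's parser, not part of the
# gist): an exclusive 'value' state models the OPERATOR -> VALUE switch above.
import ply.lex as lex

tokens = ('IDENTIFIER', 'OPERATOR', 'VALUE')

# 'value' is an exclusive lexing state entered after an assignment operator.
states = (('value', 'exclusive'),)

t_ignore = ' \t'
t_value_ignore = ' \t'

def t_OPERATOR(t):
    r'(=[+.]|[+.:?]=|=)'
    t.lexer.begin('value')    # the rest of the line is the assignment's value
    return t

def t_IDENTIFIER(t):
    r'[a-zA-Z0-9_.${}/+-]+'
    return t

def t_value_VALUE(t):
    r'[^\n]+'
    t.lexer.begin('INITIAL')  # back to the normal state after the value
    return t

def t_ANY_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

def t_ANY_error(t):
    raise SyntaxError('Unexpected character %r' % t.value[0])

if __name__ == '__main__':
    bb_lexer = lex.lex()
    bb_lexer.input('FOO = "bar"\n')
    for tok in iter(bb_lexer.token, None):
        print(tok)
    # Expected: IDENTIFIER 'FOO', OPERATOR '=', VALUE '"bar"'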