- Verify that IndentTokenizer works with a non-LINE-based tokenizer specification, rather than only the default (a rough sketch of such a call follows this list)
- Resurrect the 'NEWLINE' token, as we need it to be as picky about the file format as the current parser is
- Implement a parser which leverages this to properly change the lexer states for both ordinary functions and "def" syntax functions. Decide whether it should be custom, PLY, codetalker, or something else, by first determining whether those libraries can support a lexer like ours
- Do performance testing comparing the new parser against the old one, and against the pyparsing implementation
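Not from the gist: a rough, untested sketch of what the first item above might look like, driving IndentTokenizer (defined in the first file below) with a word-level specification instead of the default LINE-based one. The module name indenttokenizer is an assumed filename; the gist does not name its files.

# Hedged sketch for the first TODO item (not part of the gist).
# 'indenttokenizer' is an assumed module name for the IndentTokenizer file below.
from indenttokenizer import IndentTokenizer

word_spec = [
    ('NEWLINE', r'\n'),
    ('SKIP', r'[ \t]+'),
    ('WORD', r'\S+'),
]

source = (
    "task:\n"
    "  echo one\n"
    "  echo two\n"
    "done\n"
)

for token in IndentTokenizer(source, word_spec):
    print(token)
# Expected types and values (line/column omitted): WORD 'task:', INDENT '  ',
# WORD 'echo', WORD 'one', WORD 'echo', WORD 'two', DEDENT '  ', WORD 'done'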
Created May 10, 2011 00:38
Experiments with splitting the lexer out of the parser for BitBake's file format
import lexer
import re


class IndentTokenizer(lexer.Tokenizer):
    """Tokenizer which tracks indentation, for parsing python-like strings"""

    _white = r'(\s+)(?=\S)'
    _line = r'.*(?=\n)'
    _spec = [
        ('NEWLINE', r'\n'),
        ('LINE', _line),
    ]

    def __init__(self, string, tok_spec=None):
        self.current_indent = 0
        self.indents = []
        self.pending = []
        self.after_newline = True
        if tok_spec is None:
            tok_spec = self._spec
        lexer.Tokenizer.__init__(self, string, tok_spec)

    def next_token(self):
        if self.pending:
            return self.pending.pop(0)
        token = lexer.Tokenizer.next_token(self)
        if not token and self.indents:
            # flush any remaining indent levels at end of input
            return lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
        return token

    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.after_newline = True
        else:
            if self.after_newline:
                # first match on a new line: emit INDENT/DEDENT before the token itself
                self.after_newline = False
                obj = self.handle_indents(old_position)
                if obj:
                    self.position = old_position
                    return obj
        return lexer.Tokenizer.process_match(self, match, old_position)

    def handle_indents(self, position):
        indents = list(self.process_indent(position))
        if indents:
            self.pending.extend(indents[1:])
            return indents[0]

    def process_indent(self, position):
        line = re.compile(self._line).match(self.string, position)
        if line:
            line_value = line.group()
            if line_value.strip():
                matched = re.match(self._white, line_value)
                if matched:
                    indent = matched.group(1)
                    if self.indents and indent == self.indents[-1]:
                        pass
                    elif len(self.indents) > 1 and indent == self.indents[-2]:
                        # dedent
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
                    else:
                        self.indents.append(indent)
                        yield lexer.Token('INDENT', indent, self.line - 1, 0)
                elif self.indents:
                    while self.indents:
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)


def parse_function(string, position):
    """Collect the lines of an indented block, stopping at the matching dedent."""
    body = []
    indent = 0
    tokenizer = IndentTokenizer(string)
    tokenizer.position = position
    for token in tokenizer:
        if token.typ == 'INDENT':
            indent += 1
        elif token.typ == 'DEDENT':
            indent -= 1
            if not indent:
                break
        else:
            body.append(token.value)
    return [line + '\n' for line in body], tokenizer.position


if __name__ == '__main__':
    teststring = """
def foo(value):
    print(value * 5)
    # foo
    def anotherfunc(anothervalue):
        print('hi, mom!')
        return anothervalue + 6
    return anotherfunc(12)
print("hi, mom!")
"""
    body, position = parse_function(teststring, 0)
    print(''.join(body))
    assert teststring[position:] == 'print("hi, mom!")\n'
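For reference, here is a small sketch (not part of the gist) of the raw token stream the default LINE-based spec yields before parse_function consumes it. It assumes the file above is saved as indenttokenizer.py and the lexer module below as lexer.py; both filenames are assumptions.

# Hedged sketch: the raw INDENT/DEDENT/LINE stream behind parse_function().
# 'indenttokenizer' is an assumed module name for the file above.
from indenttokenizer import IndentTokenizer

source = (
    "shellfunc () {\n"
    "    echo foo\n"
    "}\n"
)

for token in IndentTokenizer(source):
    print(token)
# Expected types and values (line/column omitted):
#   LINE 'shellfunc () {'
#   INDENT '    '
#   LINE '    echo foo'
#   DEDENT '    '
#   LINE '}'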
import collections
import re
import sys


Token = collections.namedtuple('Token', 'typ value line column')


class Tokenizer(object):
    """Simple regular expression based tokenizer with support for lexing states.

    Based on http://docs.python.org/dev/library/re.html#writing-a-tokenizer.
    """

    def __init__(self, string, tok_spec=None, keywords=None):
        self.states = []
        self.string = string
        self.line = 1
        self.position = self.line_start = 0
        self.newline_tokens = False
        self.length = len(string)
        if keywords is None:
            keywords = []
        self.keywords = keywords
        if tok_spec is not None:
            self.push_state(tok_spec)

    def push_state(self, tok_spec):
        # spec entries whose names start with '_' are helpers, not token groups
        spec = [elem for elem in tok_spec if not elem[0].startswith('_')]
        match = self.spec_match(spec)
        self.states.append((tok_spec, match))

    def pop_state(self):
        self.states.pop()

    def spec_match(self, tok_spec):
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        match = re.compile(tok_re).match
        return lambda: match(self.string, self.position)

    def next(self):
        token_obj = self.next_token()
        if token_obj:
            return token_obj

        if self.position != self.length:
            raise RuntimeError('Unexpected character %r on line %d' %
                               (self.string[self.position], self.line))
        else:
            raise StopIteration()

    __next__ = next  # Python 3 compatibility for the iterator protocol

    def next_token(self):
        specification, match = self.states[-1]
        for matched in iter(match, None):
            old_position = self.position
            self.position = matched.end()
            token = self.process_match(matched, old_position)
            if token:
                if token.typ == 'IDENTIFIER' and token.value in self.keywords:
                    return token._replace(typ='KEYWORD')
                return token

    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.line_start = old_position
            self.line += 1
        elif typ != 'SKIP':
            return self.generate_token(match)

    def generate_token(self, match):
        typ = match.lastgroup
        value = match.group(typ)
        return Token(typ, value, self.line, match.start() - self.line_start)

    def __iter__(self):
        return self


def get_tokens(string):
    specification = [
        ('OPERATOR', r'(=[+.]|[+.:?]=|=)'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'{'),
        ('RBRACE', r'}'),
        ('COLON', r':'),
        # literal '+', '-', '_', '.' allowed in identifiers ('-' escaped so it is
        # not treated as a character range)
        ('IDENTIFIER', r'[a-zA-Z0-9+\-_.${}/]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
    ]
    line_spec = [
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('LINE', r'.*(?=\n)'),
    ]
    tokenizer = Tokenizer(string, specification,
                          keywords=['inherit', 'include', 'require', 'addtask', 'export',
                                    'before', 'after', 'python', 'EXPORT_FUNCTIONS'])
    for token in tokenizer:
        if token.typ == 'OPERATOR':
            # switch to line-oriented lexing for the assignment's right-hand side
            yield token
            tokenizer.push_state(line_spec)
        elif token.typ == 'LINE':
            tokenizer.pop_state()
            yield token._replace(typ='VALUE')
        else:
            yield token


def main():
    teststring = """
inherit foo
include bar
require foo/bar.conf
FOO = "bar"
BAR := "foo bar"
ALPHA += "beta"
BETA .= 'theta'
OMEGA =. omega
# something commented
TEST =+ "meh"
python () {
    alpha
    beta
    theta
}
shellfunc () {
    echo foo
}
EXPORT_FUNCTIONS myfunc anotherfunc
EXPORT_FUNCTIONS myfunc
addtask some_task before this after that
addtask some_task before this
addtask some_task after that
addtask some_task
def get_something_or_other(d):
    def something_else():
        return 5
    return something_else() * 3
"""
    for token in get_tokens(teststring):
        print(token)


if __name__ == '__main__':
    result = main()
    if not result:
        sys.exit(1)
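A small usage sketch, not part of the gist, assuming the file above is saved as lexer.py (the gist does not name its files). It exercises the OPERATOR-driven state switch in get_tokens: after the operator, the line spec is pushed, so the right-hand side of the assignment comes back as a single VALUE token.

# Hedged usage sketch (not part of the gist); 'lexer' is an assumed filename.
import lexer

for token in lexer.get_tokens('FOO = "bar baz"\ninherit foo\n'):
    print(token)
# Expected token types and values (line/column omitted):
#   IDENTIFIER 'FOO', OPERATOR '=', VALUE '"bar baz"',
#   KEYWORD 'inherit', IDENTIFIER 'foo'

As written, main() above should stop with a RuntimeError once it reaches the '*' inside the def body, since the flat specification has no rule for that character; switching to an indentation-aware state for those bodies is what the TODO list at the top describes.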
On Tue, May 10, 2011 at 1:07 AM, esben ***@***.*** wrote:
I have written a BitBake parser with PLY (ply.lex + ply.yacc); it has achieved performance comparable to the current BitBake parser, and I am not done optimizing yet.
I would very much like to share the work done with you.
Nice, that sounds very promising. I'd love to check it out. Well done :)
Christopher Larson
clarson at kergoth dot com
Founder - BitBake, OpenEmbedded, OpenZaurus
Maintainer - Tslib
Senior Software Engineer, Mentor Graphics
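For reference on the PLY question in the TODO list: PLY's lexer does support state switching much like push_state/pop_state above. The following is a hedged, illustrative sketch only, not esben's parser; all token names, rules, and the driver are assumptions.

# Illustrative sketch of PLY lexer states (not esben's parser, not part of the
# gist): an exclusive 'value' state models the OPERATOR -> VALUE switch above.
import ply.lex as lex

tokens = ('IDENTIFIER', 'OPERATOR', 'VALUE')

# 'value' is an exclusive lexing state entered after an assignment operator.
states = (('value', 'exclusive'),)

t_ignore = ' \t'
t_value_ignore = ' \t'

def t_OPERATOR(t):
    r'(=[+.]|[+.:?]=|=)'
    t.lexer.begin('value')    # the rest of the line is the assignment's value
    return t

def t_IDENTIFIER(t):
    r'[a-zA-Z0-9_.${}/+-]+'
    return t

def t_value_VALUE(t):
    r'[^\n]+'
    t.lexer.begin('INITIAL')  # back to the normal state after the value
    return t

def t_ANY_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

def t_ANY_error(t):
    raise SyntaxError('Unexpected character %r' % t.value[0])

if __name__ == '__main__':
    bb_lexer = lex.lex()
    bb_lexer.input('FOO = "bar"\n')
    for tok in iter(bb_lexer.token, None):
        print(tok)
    # Expected: IDENTIFIER 'FOO', OPERATOR '=', VALUE '"bar"'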