Parser examples in Python: a top-down operator precedence (Pratt) parser from the effbot "Simple Top-Down Parsing" article, followed by Eli Bendersky's precedence climbing expression parser (both Python 2).
# see http://effbot.org/zone/simple-top-down-parsing.htm

import sys
import re

try:
    # test binding for python's built-in tokenizer; see
    # http://svn.effbot.org/public/stuff/sandbox/pytoken
    import pytoken
except ImportError:
    pytoken = None

if 1:
    # symbol (token type) registry
    symbol_table = {}

    class symbol_base(object):

        id = None
        value = None
        first = second = third = None

        def nud(self):
            raise SyntaxError("Syntax error (%r)." % self.id)

        def led(self, left):
            raise SyntaxError("Unknown operator (%r)." % self.id)

        def __repr__(self):
            if self.id == "(name)" or self.id == "(literal)":
                return "(%s %s)" % (self.id[1:-1], self.value)
            out = [self.id, self.first, self.second, self.third]
            out = map(str, filter(None, out))
            return "(" + " ".join(out) + ")"

    def symbol(id, bp=0):
        try:
            s = symbol_table[id]
        except KeyError:
            class s(symbol_base):
                pass
            s.__name__ = "symbol-" + id # for debugging
            s.id = id
            s.value = None
            s.lbp = bp
            symbol_table[id] = s
        else:
            s.lbp = max(bp, s.lbp)
        return s
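    # e.g. symbol("+", 110) registers a new class symbol-+ with lbp=110;
    # a later symbol("+") call returns that same class, so repeated
    # registrations only ever raise the binding power (the max() above).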
    # helpers

    def infix(id, bp):
        def led(self, left):
            self.first = left
            self.second = expression(bp)
            return self
        symbol(id, bp).led = led

    def infix_r(id, bp):
        def led(self, left):
            self.first = left
            self.second = expression(bp-1)
            return self
        symbol(id, bp).led = led

    def prefix(id, bp):
        def nud(self):
            self.first = expression(bp)
            return self
        symbol(id).nud = nud

    def advance(id=None):
        global token
        if id and token.id != id:
            raise SyntaxError("Expected %r" % id)
        token = next()

    def method(s):
        # decorator
        assert issubclass(s, symbol_base)
        def bind(fn):
            setattr(s, fn.__name__, fn)
        return bind

    # python expression syntax

    symbol("lambda", 20)
    symbol("if", 20); symbol("else") # ternary form

    infix_r("or", 30); infix_r("and", 40); prefix("not", 50)

    infix("in", 60); infix("not", 60) # not in
    infix("is", 60)
    infix("<", 60); infix("<=", 60)
    infix(">", 60); infix(">=", 60)
    infix("<>", 60); infix("!=", 60); infix("==", 60)

    infix("|", 70); infix("^", 80); infix("&", 90)

    infix("<<", 100); infix(">>", 100)

    infix("+", 110); infix("-", 110)

    infix("*", 120); infix("/", 120); infix("//", 120)
    infix("%", 120)

    prefix("-", 130); prefix("+", 130); prefix("~", 130)

    infix_r("**", 140)

    symbol(".", 150); symbol("[", 150); symbol("(", 150)

    # additional behaviour

    symbol("(name)").nud = lambda self: self
    symbol("(literal)").nud = lambda self: self

    symbol("(end)")

    symbol(")")

    @method(symbol("("))
    def nud(self):
        # parenthesized form; replaced by tuple former below
        expr = expression()
        advance(")")
        return expr

    symbol("else")

    @method(symbol("if"))
    def led(self, left):
        self.first = left
        self.second = expression()
        advance("else")
        self.third = expression()
        return self

    @method(symbol("."))
    def led(self, left):
        if token.id != "(name)":
| SyntaxError("Expected an attribute name.") | |
        self.first = left
        self.second = token
        advance()
        return self

    symbol("]")

    @method(symbol("["))
    def led(self, left):
        self.first = left
        self.second = expression()
        advance("]")
        return self

    symbol(")"); symbol(",")

    @method(symbol("("))
    def led(self, left):
        self.first = left
        self.second = []
        if token.id != ")":
            while 1:
                self.second.append(expression())
                if token.id != ",":
                    break
                advance(",")
        advance(")")
        return self

    symbol(":"); symbol("=")

    @method(symbol("lambda"))
    def nud(self):
        self.first = []
        if token.id != ":":
            argument_list(self.first)
        advance(":")
        self.second = expression()
        return self

    def argument_list(list):
        while 1:
            if token.id != "(name)":
| SyntaxError("Expected an argument name.") | |
            list.append(token)
            advance()
            if token.id == "=":
                advance()
                list.append(expression())
            else:
                list.append(None)
            if token.id != ",":
                break
            advance(",")

    # constants

    def constant(id):
        @method(symbol(id))
        def nud(self):
            self.id = "(literal)"
            self.value = id
            return self

    constant("None")
    constant("True")
    constant("False")

    # multitoken operators

    @method(symbol("not"))
    def led(self, left):
        if token.id != "in":
            raise SyntaxError("Invalid syntax")
        advance()
        self.id = "not in"
        self.first = left
        self.second = expression(60)
        return self

    @method(symbol("is"))
    def led(self, left):
        if token.id == "not":
            advance()
            self.id = "is not"
        self.first = left
        self.second = expression(60)
        return self

    # displays

    @method(symbol("("))
    def nud(self):
        self.first = []
        comma = False
        if token.id != ")":
            while 1:
                if token.id == ")":
                    break
                self.first.append(expression())
                if token.id != ",":
                    break
                comma = True
                advance(",")
        advance(")")
        if not self.first or comma:
            return self # tuple
        else:
            return self.first[0]

    symbol("]")

    @method(symbol("["))
    def nud(self):
        self.first = []
        if token.id != "]":
            while 1:
                if token.id == "]":
                    break
                self.first.append(expression())
                if token.id != ",":
                    break
                advance(",")
        advance("]")
        return self

    symbol("}")

    @method(symbol("{"))
    def nud(self):
        self.first = []
        if token.id != "}":
            while 1:
                if token.id == "}":
                    break
                self.first.append(expression())
                advance(":")
                self.first.append(expression())
                if token.id != ",":
                    break
                advance(",")
        advance("}")
        return self

# python tokenizer

def tokenize_python(program):
    import tokenize
    from cStringIO import StringIO
    type_map = {
        tokenize.NUMBER: "(literal)",
        tokenize.STRING: "(literal)",
        tokenize.OP: "(operator)",
        tokenize.NAME: "(name)",
    }
    for t in tokenize.generate_tokens(StringIO(program).next):
        try:
            yield type_map[t[0]], t[1]
        except KeyError:
            if t[0] == tokenize.NL:
                continue
            if t[0] == tokenize.ENDMARKER:
                break
            else:
                raise SyntaxError("Syntax error")
    yield "(end)", "(end)"

def tokenize(program):
    if isinstance(program, list):
        source = program
    else:
        source = tokenize_python(program)
    for id, value in source:
        if id == "(literal)":
            symbol = symbol_table[id]
            s = symbol()
            s.value = value
        else:
            # name or operator
            symbol = symbol_table.get(value)
            if symbol:
                s = symbol()
            elif id == "(name)":
                symbol = symbol_table[id]
                s = symbol()
                s.value = value
            else:
                raise SyntaxError("Unknown operator (%r)" % id)
        yield s

# parser engine

def expression(rbp=0):
    global token
    t = token
    token = next()
    left = t.nud()
    while rbp < token.lbp:
        t = token
        token = next()
        left = t.led(left)
    return left
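# How the engine binds "1+2*3" via expression(0), as an illustration:
# (literal 1).nud() gives the left operand; "+" (lbp 110) is stronger
# than rbp 0, so "+".led() runs and calls expression(110); inside it,
# "*" (lbp 120) exceeds 110, so 2*3 is parsed first and becomes the
# right operand of "+".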
def parse(program):
    global token, next
    next = tokenize(program).next
    token = next()
    return expression()

def test(program):
    print ">>>", program
    print parse(program)

# taken from the python FAQ
program = """(lambda Ru,Ro,Iu,Io,IM,Sx,Sy:reduce(lambda x,y:x+y,map(lambda y,Iu=Iu,Io=Io,Ru=Ru,Ro=Ro,Sy=Sy,L=lambda yc,Iu=Iu,Io=Io,Ru=Ru,Ro=Ro,i=IM,Sx=Sx,Sy=Sy:reduce(lambda x,y:x+y,map(lambda x,xc=Ru,yc=yc,Ru=Ru,Ro=Ro,i=i,Sx=Sx,F=lambda xc,yc,x,y,k,f=lambda xc,yc,x,y,k,f:(k<=0)or (x*x+y*y>=4.0) or 1+f(xc,yc,x*x-y*y+xc,2.0*x*y+yc,k-1,f):f(xc,yc,x,y,k,f):chr(64+F(Ru+x*(Ro-Ru)/Sx,yc,0,0,i)),range(Sx))):L(Iu+y*(Io-Iu)/Sy),range(Sy))))(-2.1, 0.7, -1.2, 1.2, 30, 80, 24)"""

# program = program + "+" + program
# program = program + "+" + program
# program = program + "+" + program

if "--benchmark" in sys.argv:

    def custom_tokenize_python(program):
        # simplified tokenizer for this expression
        pattern = r"\s*(?:(<=|>=|\W)|([a-zA-Z]\w*)|(\d+(?:\.\d*)?))"
        for operator, name, literal in re.findall(pattern, program):
            if operator:
                yield "(operator)", operator
            elif name:
                yield "(name)", name
            elif literal:
                yield "(literal)", literal
            else:
                raise SyntaxError
        yield "(end)", "(end)"

    import time

    print len(program), "bytes"
    print len(list(tokenize(program))), "tokens"

    def bench(name, func):
        t0 = time.clock()
        for i in xrange(1000):
            func(program)
        print name, time.clock() - t0

    import parser, compiler

    program_list = list(tokenize_python(program))

    bench("topdown", parse)
    bench("topdown pretokenized", lambda program: parse(program_list))

    tokenize_python = custom_tokenize_python
    bench("custom topdown", parse)

    if pytoken:
        tokenize_python = pytoken.token_list
        bench("built-in topdown", parse)

    bench("built-in compile", lambda program: compile(program, "", "eval"))
    bench("parser.parse", lambda program: parser.st2tuple(parser.expr(program)))
    bench("compiler.parse", lambda program: compiler.parse(program, "eval"))
    bench("compiler.compile", lambda program: compiler.compile(program, "", "eval"))

    sys.exit(0)

# samples

test("1")
test("+1")
test("-1")
test("1+2")
test("1+2+3")
test("1+2*3")
test("(1+2)*3")
test("()")
test("(1)")
test("(1,)")
test("(1, 2)")
test("[1, 2, 3]")
test("{}")
test("{1: 'one', 2: 'two'}")
test("1.0*2+3")
test("'hello'+'world'")
test("2**3**4")
test("1 and 2")
test("foo.bar")
test("1 + hello")
test("1 if 2 else 3")
test("'hello'[0]")
test("hello()")
test("hello(1,2,3)")
test("lambda: 1")
test("lambda a, b, c: a+b+c")
test("True")
test("True or False")
test("1 in 2")
test("1 not in 2")
test("1 is 2")
test("1 is not 2")
test("1 is (not 2)")

print list(tokenize("1 not in 2"))
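A quick sanity check of the tree this parser builds (an interactive sketch; the module name "tdop" is only a placeholder for wherever you saved the file):

>>> from tdop import parse
>>> parse("1+2*3")
(+ (literal 1) (* (literal 2) (literal 3)))

The second file is Eli Bendersky's precedence climbing parser, which evaluates the arithmetic directly instead of building a tree: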
#-----------------------------------------------
# Precedence climbing expression parser.
#
# Eli Bendersky ([email protected])
# License: this code is in the public domain
# Last modified: July 2012
#
# http://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing/
#-----------------------------------------------
from collections import namedtuple
import re

Tok = namedtuple('Tok', 'name value')

class Tokenizer(object):
    """ Simple tokenizer object. The cur_token attribute holds the current
        token (Tok). Call get_next_token() to advance to the
        next token. cur_token is None before the first token is
        taken and after the source ends.
    """
    TOKPATTERN = re.compile(r"\s*(?:(\d+)|(.))")

    def __init__(self, source):
        self._tokgen = self._gen_tokens(source)
        self.cur_token = None

    def get_next_token(self):
        """ Advance to the next token, and return it.
        """
        try:
            self.cur_token = self._tokgen.next()
        except StopIteration:
            self.cur_token = None
        return self.cur_token

    def _gen_tokens(self, source):
        for number, operator in self.TOKPATTERN.findall(source):
            if number:
                yield Tok('NUMBER', number)
            elif operator == '(':
                yield Tok('LEFTPAREN', '(')
            elif operator == ')':
                yield Tok('RIGHTPAREN', ')')
            else:
                yield Tok('BINOP', operator)

    def __repr__(self):
        return 'Tokenizer(cur_token=%s)' % str(self.cur_token)
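# Stepping the tokenizer by hand (illustrative):
#   t = Tokenizer('2 + 3')
#   t.get_next_token()   # Tok(name='NUMBER', value='2')
#   t.get_next_token()   # Tok(name='BINOP', value='+')
#   t.get_next_token()   # Tok(name='NUMBER', value='3')
#   t.get_next_token()   # None -- source exhausted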
# For each operator, a (precedence, associativity) pair.
OpInfo = namedtuple('OpInfo', 'prec assoc')

OPINFO_MAP = {
    '+': OpInfo(1, 'LEFT'),
    '-': OpInfo(1, 'LEFT'),
    '*': OpInfo(2, 'LEFT'),
    '/': OpInfo(2, 'LEFT'),
    '^': OpInfo(3, 'RIGHT'),
}

def parse_error(msg):
    raise RuntimeError(msg)
try:
    # TraceCalls is a debugging helper from the author's eblib package;
    # fall back to a no-op decorator when it is not installed.
    from eblib.tracer import TraceCalls
except ImportError:
    def TraceCalls(show_ret=False):
        def decorator(fn):
            return fn
        return decorator
@TraceCalls(show_ret=True)
def compute_atom(tokenizer):
    tok = tokenizer.cur_token
    # Check for end-of-source first: tok.name on None would raise
    # AttributeError before the original ordering reached this case.
    if tok is None:
        parse_error('source ended unexpectedly')
    elif tok.name == 'LEFTPAREN':
        tokenizer.get_next_token()
        val = compute_expr(tokenizer, 1)
        if tokenizer.cur_token is None or tokenizer.cur_token.name != 'RIGHTPAREN':
            parse_error('unmatched "("')
        tokenizer.get_next_token()
        return val
    elif tok.name == 'BINOP':
        parse_error('expected an atom, not an operator "%s"' % tok.value)
    else:
        assert tok.name == 'NUMBER'
        tokenizer.get_next_token()
        return int(tok.value)
@TraceCalls(show_ret=True)
def compute_expr(tokenizer, min_prec):
    atom_lhs = compute_atom(tokenizer)

    while True:
        cur = tokenizer.cur_token
        if (cur is None or cur.name != 'BINOP'
                or OPINFO_MAP[cur.value].prec < min_prec):
            break

        # Inside this loop the current token is a binary operator
        assert cur.name == 'BINOP'

        # Get the operator's precedence and associativity, and compute a
        # minimal precedence for the recursive call
        op = cur.value
        prec, assoc = OPINFO_MAP[op]
        next_min_prec = prec + 1 if assoc == 'LEFT' else prec
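        # Worked numbers: for left-associative '+' (prec 1) next_min_prec
        # is 2, so the recursive call stops before another '+' and
        # "9 - 8 - 7" groups as (9 - 8) - 7; for right-associative '^'
        # (prec 3) it stays 3, so "2 ^ 3 ^ 4" groups as 2 ^ (3 ^ 4).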
        # Consume the current token and prepare the next one for the
        # recursive call
        tokenizer.get_next_token()
        atom_rhs = compute_expr(tokenizer, next_min_prec)

        # Update lhs with the new value
        atom_lhs = compute_op(op, atom_lhs, atom_rhs)

    return atom_lhs

def compute_op(op, lhs, rhs):
    lhs = int(lhs); rhs = int(rhs)
    if op == '+': return lhs + rhs
    elif op == '-': return lhs - rhs
    elif op == '*': return lhs * rhs
    elif op == '/': return lhs / rhs
    elif op == '^': return lhs ** rhs
    else:
        parse_error('unknown operator "%s"' % op)

def test():
    def compute(s):
        t = Tokenizer(s)
        t.get_next_token()
        return compute_expr(t, 1)

    assert compute('1 + 2 * 3') == 7
    assert compute('7 - 9 * (2 - 3)') == 16
    assert compute('2 * 3 * 4') == 24
    assert compute('2 ^ 3 ^ 4') == 2 ** (3 ** 4)
    assert compute('(2 ^ 3) ^ 4') == 4096
    assert compute('5') == 5
    assert compute('4 + 2') == 6
    assert compute('9 - 8 - 7') == -6
    assert compute('9 - (8 - 7)') == 8
    assert compute('(9 - 8) - 7') == -6
    assert compute('2 + 3 ^ 2 * 3 + 4') == 33

if __name__ == '__main__':
    #test()
    t = Tokenizer('2 + 3^2*3 + 4')
    t.get_next_token()
    print compute_expr(t, min_prec=1)
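Run as a script, this prints 33, matching the last assert in test(): "^" binds tighter than "*", so the input groups as 2 + (3^2)*3 + 4. An interactive sketch (assuming the no-op TraceCalls fallback above, so no trace lines appear):

>>> t = Tokenizer('2 + 3^2*3 + 4')
>>> t.get_next_token()
Tok(name='NUMBER', value='2')
>>> compute_expr(t, min_prec=1)
33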