# Parser examples in Python
# see http://effbot.org/zone/simple-top-down-parsing.htm
import sys
import re
try:
    # test binding for python's built-in tokenizer; see
    # http://svn.effbot.org/public/stuff/sandbox/pytoken
    import pytoken
except ImportError:
    pytoken = None
if 1:
    # symbol (token type) registry
    symbol_table = {}

    class symbol_base(object):

        id = None
        value = None
        first = second = third = None

        def nud(self):
            raise SyntaxError("Syntax error (%r)." % self.id)

        def led(self, left):
            raise SyntaxError("Unknown operator (%r)." % self.id)

        def __repr__(self):
            if self.id == "(name)" or self.id == "(literal)":
                return "(%s %s)" % (self.id[1:-1], self.value)
            out = [self.id, self.first, self.second, self.third]
            out = map(str, filter(None, out))
            return "(" + " ".join(out) + ")"

    def symbol(id, bp=0):
        try:
            s = symbol_table[id]
        except KeyError:
            class s(symbol_base):
                pass
            s.__name__ = "symbol-" + id # for debugging
            s.id = id
            s.value = None
            s.lbp = bp
            symbol_table[id] = s
        else:
            s.lbp = max(bp, s.lbp)
        return s
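    # Illustrative sketch (not from the article): symbol() returns the same
    # class for a given token id, and the binding power only ever grows:
    #
    #   plus = symbol("+", 110)
    #   assert symbol("+") is plus          # class is reused
    #   assert symbol("+", 50).lbp == 110   # lbp is max()'d, never lowered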
    # helpers

    def infix(id, bp):
        def led(self, left):
            self.first = left
            self.second = expression(bp)
            return self
        symbol(id, bp).led = led

    def infix_r(id, bp):
        def led(self, left):
            self.first = left
            self.second = expression(bp-1)
            return self
        symbol(id, bp).led = led

    def prefix(id, bp):
        def nud(self):
            self.first = expression(bp)
            return self
        symbol(id).nud = nud
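    # Note on associativity: infix() parses the right operand with the
    # operator's own binding power, so equal-precedence operators group to
    # the left ("1-2-3" parses as (- (- 1 2) 3)); infix_r() uses bp-1, so
    # the operator groups to the right ("2**3**4" parses as (** 2 (** 3 4))).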
    def advance(id=None):
        global token
        if id and token.id != id:
            raise SyntaxError("Expected %r" % id)
        token = next()

    def method(s):
        # decorator
        assert issubclass(s, symbol_base)
        def bind(fn):
            setattr(s, fn.__name__, fn)
        return bind
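    # Usage sketch: "@method(symbol('('))" re-binds the decorated function as
    # a method on that symbol class, so nud/led handlers can be written at
    # module level; bind() returns None, so the module-level name ("nud",
    # "led") is deliberately thrown away after each use.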
    # python expression syntax

    symbol("lambda", 20)
    symbol("if", 20); symbol("else") # ternary form

    infix_r("or", 30); infix_r("and", 40); prefix("not", 50)

    infix("in", 60); infix("not", 60) # not in
    infix("is", 60)
    infix("<", 60); infix("<=", 60)
    infix(">", 60); infix(">=", 60)
    infix("<>", 60); infix("!=", 60); infix("==", 60)

    infix("|", 70); infix("^", 80); infix("&", 90)

    infix("<<", 100); infix(">>", 100)

    infix("+", 110); infix("-", 110)

    infix("*", 120); infix("/", 120); infix("//", 120)
    infix("%", 120)

    prefix("-", 130); prefix("+", 130); prefix("~", 130)

    infix_r("**", 140)

    symbol(".", 150); symbol("[", 150); symbol("(", 150)

    # additional behaviour

    symbol("(name)").nud = lambda self: self
    symbol("(literal)").nud = lambda self: self

    symbol("(end)")

    symbol(")")

    @method(symbol("("))
    def nud(self):
        # parenthesized form; replaced by tuple former below
        expr = expression()
        advance(")")
        return expr

    symbol("else")

    @method(symbol("if"))
    def led(self, left):
        self.first = left
        self.second = expression()
        advance("else")
        self.third = expression()
        return self
@method(symbol("."))
def led(self, left):
if token.id != "(name)":
SyntaxError("Expected an attribute name.")
self.first = left
self.second = token
advance()
return self
symbol("]")
@method(symbol("["))
def led(self, left):
self.first = left
self.second = expression()
advance("]")
return self
symbol(")"); symbol(",")
@method(symbol("("))
def led(self, left):
self.first = left
self.second = []
if token.id != ")":
while 1:
self.second.append(expression())
if token.id != ",":
break
advance(",")
advance(")")
return self
symbol(":"); symbol("=")
@method(symbol("lambda"))
def nud(self):
self.first = []
if token.id != ":":
argument_list(self.first)
advance(":")
self.second = expression()
return self
    def argument_list(list):
        while 1:
            if token.id != "(name)":
                raise SyntaxError("Expected an argument name.")
            list.append(token)
            advance()
            if token.id == "=":
                advance()
                list.append(expression())
            else:
                list.append(None)
            if token.id != ",":
                break
            advance(",")
    # constants

    def constant(id):
        @method(symbol(id))
        def nud(self):
            self.id = "(literal)"
            self.value = id
            return self

    constant("None")
    constant("True")
    constant("False")

    # multitoken operators

    @method(symbol("not"))
    def led(self, left):
        if token.id != "in":
            raise SyntaxError("Invalid syntax")
        advance()
        self.id = "not in"
        self.first = left
        self.second = expression(60)
        return self

    @method(symbol("is"))
    def led(self, left):
        if token.id == "not":
            advance()
            self.id = "is not"
        self.first = left
        self.second = expression(60)
        return self

    # displays

    @method(symbol("("))
    def nud(self):
        self.first = []
        comma = False
        if token.id != ")":
            while 1:
                if token.id == ")":
                    break
                self.first.append(expression())
                if token.id != ",":
                    break
                comma = True
                advance(",")
        advance(")")
        if not self.first or comma:
            return self # tuple
        else:
            return self.first[0]

    symbol("]")

    @method(symbol("["))
    def nud(self):
        self.first = []
        if token.id != "]":
            while 1:
                if token.id == "]":
                    break
                self.first.append(expression())
                if token.id != ",":
                    break
                advance(",")
        advance("]")
        return self

    symbol("}")

    @method(symbol("{"))
    def nud(self):
        self.first = []
        if token.id != "}":
            while 1:
                if token.id == "}":
                    break
                self.first.append(expression())
                advance(":")
                self.first.append(expression())
                if token.id != ",":
                    break
                advance(",")
        advance("}")
        return self
    # python tokenizer

    def tokenize_python(program):
        import tokenize
        from cStringIO import StringIO
        type_map = {
            tokenize.NUMBER: "(literal)",
            tokenize.STRING: "(literal)",
            tokenize.OP: "(operator)",
            tokenize.NAME: "(name)",
            }
        for t in tokenize.generate_tokens(StringIO(program).next):
            try:
                yield type_map[t[0]], t[1]
            except KeyError:
                if t[0] == tokenize.NL:
                    continue
                if t[0] == tokenize.ENDMARKER:
                    break
                else:
                    raise SyntaxError("Syntax error")
        yield "(end)", "(end)"
    def tokenize(program):
        if isinstance(program, list):
            source = program
        else:
            source = tokenize_python(program)
        for id, value in source:
            if id == "(literal)":
                symbol = symbol_table[id]
                s = symbol()
                s.value = value
            else:
                # name or operator
                symbol = symbol_table.get(value)
                if symbol:
                    s = symbol()
                elif id == "(name)":
                    symbol = symbol_table[id]
                    s = symbol()
                    s.value = value
                else:
                    raise SyntaxError("Unknown operator (%r)" % id)
            yield s
    # parser engine

    def expression(rbp=0):
        global token
        t = token
        token = next()
        left = t.nud()
        while rbp < token.lbp:
            t = token
            token = next()
            left = t.led(left)
        return left

    def parse(program):
        global token, next
        next = tokenize(program).next
        token = next()
        return expression()
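    # Worked example: on "1+2*3", expression(0) reads "1" via nud, sees "+"
    # (lbp 110 > 0) and enters its led, which parses the right side with
    # expression(110); there "*" (lbp 120 > 110) still binds, so "2*3" is
    # grouped first and the final tree is (+ 1 (* 2 3)).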
    def test(program):
        print ">>>", program
        print parse(program)
# taken from the python FAQ
program = """(lambda Ru,Ro,Iu,Io,IM,Sx,Sy:reduce(lambda x,y:x+y,map(lambda y,Iu=Iu,Io=Io,Ru=Ru,Ro=Ro,Sy=Sy,L=lambda yc,Iu=Iu,Io=Io,Ru=Ru,Ro=Ro,i=IM,Sx=Sx,Sy=Sy:reduce(lambda x,y:x+y,map(lambda x,xc=Ru,yc=yc,Ru=Ru,Ro=Ro,i=i,Sx=Sx,F=lambda xc,yc,x,y,k,f=lambda xc,yc,x,y,k,f:(k<=0)or (x*x+y*y>=4.0) or 1+f(xc,yc,x*x-y*y+xc,2.0*x*y+yc,k-1,f):f(xc,yc,x,y,k,f):chr(64+F(Ru+x*(Ro-Ru)/Sx,yc,0,0,i)),range(Sx))):L(Iu+y*(Io-Iu)/Sy),range(Sy))))(-2.1, 0.7, -1.2, 1.2, 30, 80, 24)"""
# program = program + "+" + program
# program = program + "+" + program
# program = program + "+" + program
if "--benchmark" in sys.argv:
def custom_tokenize_python(program):
# simplified tokenizer for this expression
pattern = r"\s*(?:(<=|>=|\W)|([a-zA-Z]\w*)|(\d+(?:\.\d*)?))"
for operator, name, literal in re.findall(pattern, program):
if operator:
yield "(operator)", operator
elif name:
yield "(name)", name
elif literal:
yield "(literal)", literal
else:
raise SyntaxError
yield "(end)", "(end)"
import time
print len(program), "bytes"
print len(list(tokenize(program))), "tokens"
def bench(name, func):
t0 = time.clock()
for i in xrange(1000):
func(program)
print name, time.clock() - t0
import parser, compiler
program_list = list(tokenize_python(program))
bench("topdown", parse)
bench("topdown pretokenized", lambda program: parse(program_list))
tokenize_python = custom_tokenize_python
bench("custom topdown", parse)
if pytoken:
tokenize_python = pytoken.token_list
bench("built-in topdown", parse)
print
bench("built-in compile", lambda program: compile(program, "", "eval"))
bench("parser.parse", lambda program: parser.st2tuple(parser.expr(program)))
print
bench("compiler.parse", lambda program: compiler.parse(program, "eval"))
bench("compiler.compile", lambda program: compiler.compile(program, "", "eval"))
sys.exit(0)
# samples
test("1")
test("+1")
test("-1")
test("1+2")
test("1+2+3")
test("1+2*3")
test("(1+2)*3")
test("()")
test("(1)")
test("(1,)")
test("(1, 2)")
test("[1, 2, 3]")
test("{}")
test("{1: 'one', 2: 'two'}")
test("1.0*2+3")
test("'hello'+'world'")
test("2**3**4")
test("1 and 2")
test("foo.bar")
test("1 + hello")
test("1 if 2 else 3")
test("'hello'[0]")
test("hello()")
test("hello(1,2,3)")
test("lambda: 1")
test("lambda a, b, c: a+b+c")
test("True")
test("True or False")
test("1 in 2")
test("1 not in 2")
test("1 is 2")
test("1 is not 2")
test("1 is (not 2)")
print
print list(tokenize("1 not in 2"))
#-----------------------------------------------
# Precedence climbing expression parser.
#
# Eli Bendersky ([email protected])
# License: this code is in the public domain
# Last modified: July 2012
#
# http://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing/
#-----------------------------------------------
from collections import namedtuple
import re
Tok = namedtuple('Tok', 'name value')
class Tokenizer(object):
    """ Simple tokenizer object. The cur_token attribute holds the current
        token (Tok). Call get_next_token() to advance to the
        next token. cur_token is None before the first token is
        taken and after the source ends.
    """
    TOKPATTERN = re.compile(r"\s*(?:(\d+)|(.))")

    def __init__(self, source):
        self._tokgen = self._gen_tokens(source)
        self.cur_token = None

    def get_next_token(self):
        """ Advance to the next token, and return it.
        """
        try:
            self.cur_token = self._tokgen.next()
        except StopIteration:
            self.cur_token = None
        return self.cur_token

    def _gen_tokens(self, source):
        for number, operator in self.TOKPATTERN.findall(source):
            if number:
                yield Tok('NUMBER', number)
            elif operator == '(':
                yield Tok('LEFTPAREN', '(')
            elif operator == ')':
                yield Tok('RIGHTPAREN', ')')
            else:
                yield Tok('BINOP', operator)

    def __repr__(self):
        return 'Tokenizer(cur_token=%s)' % str(self.cur_token)
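# Usage sketch: the tokenizer is pull-based; cur_token is None until the
# first get_next_token() call and again once the source is exhausted:
#
#   t = Tokenizer('3 + 4')
#   while t.get_next_token() is not None:
#       print t.cur_token   # Tok(name='NUMBER', value='3'), ...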
# For each operator, a (precedence, associativity) pair.
OpInfo = namedtuple('OpInfo', 'prec assoc')

OPINFO_MAP = {
    '+':    OpInfo(1, 'LEFT'),
    '-':    OpInfo(1, 'LEFT'),
    '*':    OpInfo(2, 'LEFT'),
    '/':    OpInfo(2, 'LEFT'),
    '^':    OpInfo(3, 'RIGHT'),
}
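# How the table drives grouping: for a LEFT operator the recursive call uses
# min_prec = prec + 1, so in "8 - 4 - 2" the inner call stops at the second
# "-" (prec 1 < 2) and the expression groups as (8 - 4) - 2; for the RIGHT
# operator "^", min_prec stays at prec, so "2 ^ 3 ^ 4" keeps climbing and
# groups as 2 ^ (3 ^ 4).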
def parse_error(msg):
    raise RuntimeError(msg)
try:
    from eblib.tracer import TraceCalls
except ImportError:
    # TraceCalls comes from the author's personal eblib; if it isn't
    # available, fall back to a no-op decorator so the parser still runs.
    def TraceCalls(show_ret=False):
        def decorator(fn):
            return fn
        return decorator

@TraceCalls(show_ret=True)
def compute_atom(tokenizer):
    tok = tokenizer.cur_token
    # Check for end-of-input before touching tok.name
    if tok is None:
        parse_error('source ended unexpectedly')
    elif tok.name == 'LEFTPAREN':
        tokenizer.get_next_token()
        val = compute_expr(tokenizer, 1)
        if tokenizer.cur_token is None or tokenizer.cur_token.name != 'RIGHTPAREN':
            parse_error('unmatched "("')
        tokenizer.get_next_token()
        return val
    elif tok.name == 'BINOP':
        parse_error('expected an atom, not an operator "%s"' % tok.value)
    else:
        assert tok.name == 'NUMBER'
        tokenizer.get_next_token()
        return int(tok.value)
@TraceCalls(show_ret=True)
def compute_expr(tokenizer, min_prec):
    atom_lhs = compute_atom(tokenizer)

    while True:
        cur = tokenizer.cur_token
        if (cur is None or cur.name != 'BINOP'
                or OPINFO_MAP[cur.value].prec < min_prec):
            break

        # Inside this loop the current token is a binary operator
        assert cur.name == 'BINOP'

        # Get the operator's precedence and associativity, and compute a
        # minimal precedence for the recursive call
        op = cur.value
        prec, assoc = OPINFO_MAP[op]
        next_min_prec = prec + 1 if assoc == 'LEFT' else prec

        # Consume the current token and prepare the next one for the
        # recursive call
        tokenizer.get_next_token()
        atom_rhs = compute_expr(tokenizer, next_min_prec)

        # Update lhs with the new value
        atom_lhs = compute_op(op, atom_lhs, atom_rhs)

    return atom_lhs
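# Invariant worth noting: compute_expr(t, min_prec) consumes exactly the run
# of operators whose precedence is >= min_prec; the operator that fails the
# test is left as cur_token, so the caller's loop picks it up again when the
# recursion unwinds.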
def compute_op(op, lhs, rhs):
    lhs = int(lhs); rhs = int(rhs)
    if op == '+':   return lhs + rhs
    elif op == '-': return lhs - rhs
    elif op == '*': return lhs * rhs
    elif op == '/': return lhs / rhs
    elif op == '^': return lhs ** rhs
    else:
        parse_error('unknown operator "%s"' % op)
def test():
    def compute(s):
        t = Tokenizer(s)
        t.get_next_token()
        return compute_expr(t, 1)

    assert compute('1 + 2 * 3') == 7
    assert compute('7 - 9 * (2 - 3)') == 16
    assert compute('2 * 3 * 4') == 24
    assert compute('2 ^ 3 ^ 4') == 2 ** (3 ** 4)
    assert compute('(2 ^ 3) ^ 4') == 4096
    assert compute('5') == 5
    assert compute('4 + 2') == 6
    assert compute('9 - 8 - 7') == -6
    assert compute('9 - (8 - 7)') == 8
    assert compute('(9 - 8) - 7') == -6
    assert compute('2 + 3 ^ 2 * 3 + 4') == 33
if __name__ == '__main__':
    #test()
    t = Tokenizer('2 + 3^2*3 + 4')
    t.get_next_token()
    print compute_expr(t, min_prec=1)