Skip to content

Instantly share code, notes, and snippets.

@divergentdave
Last active May 26, 2020 13:30
Show Gist options
  • Save divergentdave/a010337b837611363efbab458a40727f to your computer and use it in GitHub Desktop.
Save divergentdave/a010337b837611363efbab458a40727f to your computer and use it in GitHub Desktop.
Inheritance with PLY (Python Lex-Yacc)
from __future__ import print_function
import a
import b
def main():
    """Parse one sample expression with each parser and print the results.

    For each parser the input text is printed first, then the value
    returned by ``parse()``, then a blank separator line.
    """
    def show(parser, text):
        # Echo the input, then whatever the parser returns for it.
        print(text)
        print(parser.parse(text))
        print()

    show(a.AParser(), "10 + (2 + (13) + 8)")
    show(b.BParser(), "100 - 15 - 3")


if __name__ == "__main__":
    main()

This Gist demonstrates the use of inheritance with PLY parsers. Here, BaseLexer and BaseParser are defined in the module base.py. ALexer and AParser, in a.py, and BLexer and BParser, in b.py, inherit from BaseLexer and BaseParser, extending the list of tokens and the parser rules. As a toy example, these parsers evaluate a limited subset of arithmetic expressions. base.py contains rules for numbers and parentheses, a.py adds rules for addition, and b.py adds rules for subtraction.

There are several key ingredients needed to make this work well. First, as described in the PLY documentation, each lexer needs to be in a different Python module, i.e. in a different .py file, to avoid confusing PLY's error checking. Next, the lexers and parsers are all defined from class instances. As described in the documentation, this is accomplished by calling ply.lex.lex() and ply.yacc.yacc() with the module keyword argument set to the class instance. The BaseLexer class additionally implements __iter__(), token(), and input() methods, so that its instances can be used directly in the parser. Furthermore, a different tabmodule argument is passed to ply.yacc.yacc() in each file, so that the different parsers don't clobber each other's cached tables.

With all this scaffolding in place, we create subclasses of base.BaseLexer and base.BaseParser. In ALexer and BLexer, we set the tokens class-level variable by concatenating base.BaseLexer.tokens with a list of new tokens. When PLY sets up the lexers and parsers for the subclasses, it discovers their rules using dir(), so it sees all tokens and parser rules from both the subclass and the parent class. From here on out, everything should work as expected.

In this example, ply.yacc.yacc() is also called with start set, to specify which grammar symbol is returned at the top level. Additionally, t_newline(), t_error(), and p_error() are set up to track line numbers and provide verbose error messages, including locations in the file.

import ply.lex
import ply.yacc
import base
class ALexer(base.BaseLexer):
    """Lexer for the addition grammar: the base tokens plus ``PLUS``."""

    # Extend the inherited token list instead of replacing it, so the
    # rules defined on BaseLexer remain valid for this lexer.
    tokens = base.BaseLexer.tokens + ["PLUS"]

    t_PLUS = r"\+"

    def __init__(self):
        """Build the underlying PLY lexer from this instance's rules."""
        self.lexer = ply.lex.lex(module=self)
        # BaseLexer.t_error computes the column from ``linestart``, and
        # t_newline only sets it after the first newline, so it has to be
        # initialised here or an error on the first line would fail with
        # AttributeError instead of the intended message.
        self.lexer.linestart = 0
class AParser(base.BaseParser):
    """Parser that adds an addition rule to the base expression grammar."""

    # PLY reads the token list from the parser object as well as the lexer.
    tokens = ALexer.tokens

    def __init__(self):
        """Create the matching lexer and build the parser from this instance."""
        self.lexer = ALexer()
        self.parser = make_parser(self)

    def p_expression_add(self, p):
        "expression : expression PLUS expression_factor"
        # The string above is the grammar rule PLY reads, not documentation.
        # p[1] and p[3] are the operand values; p[2] is the PLUS token.
        left_value = p[1]
        right_value = p[3]
        p[0] = left_value + right_value
def make_parser(mod):
    """Build a PLY yacc parser from the rules found on ``mod``.

    ``expression`` is the start symbol. The generated tables are cached
    under the module name ``aparsetab`` in the current directory.
    """
    yacc_options = {
        "module": mod,
        "start": "expression",
        "tabmodule": "aparsetab",
        "outputdir": ".",
    }
    return ply.yacc.yacc(**yacc_options)
if __name__ == "__main__":
    # Running this module directly regenerates the parser tables.
    # Pass an instance, the same way AParser.__init__ calls make_parser:
    # the p_* rules are written as methods taking ``self``, so handing PLY
    # the bare class would leave them unbound (rejected on Python 3).
    make_parser(AParser())
import ply.lex
import ply.yacc
import base
class BLexer(base.BaseLexer):
    """Lexer for the subtraction grammar: the base tokens plus ``MINUS``."""

    # Extend the inherited token list instead of replacing it, so the
    # rules defined on BaseLexer remain valid for this lexer.
    tokens = base.BaseLexer.tokens + ["MINUS"]

    t_MINUS = "-"

    def __init__(self):
        """Build the underlying PLY lexer from this instance's rules."""
        self.lexer = ply.lex.lex(module=self)
        # BaseLexer.t_error computes the column from ``linestart``, and
        # t_newline only sets it after the first newline, so it has to be
        # initialised here or an error on the first line would fail with
        # AttributeError instead of the intended message.
        self.lexer.linestart = 0
class BParser(base.BaseParser):
    """Parser that adds a subtraction rule to the base expression grammar."""

    # PLY reads the token list from the parser object as well as the lexer.
    tokens = BLexer.tokens

    def __init__(self):
        """Create the matching lexer and build the parser from this instance."""
        self.lexer = BLexer()
        self.parser = make_parser(self)

    def p_expression_subtract(self, p):
        "expression : expression MINUS expression_factor"
        # The string above is the grammar rule PLY reads, not documentation.
        # p[1] and p[3] are the operand values; p[2] is the MINUS token.
        left_value = p[1]
        right_value = p[3]
        p[0] = left_value - right_value
def make_parser(mod):
    """Build a PLY yacc parser from the rules found on ``mod``.

    ``expression`` is the start symbol. The generated tables are cached
    under the module name ``bparsetab`` in the current directory.
    """
    yacc_options = {
        "module": mod,
        "start": "expression",
        "tabmodule": "bparsetab",
        "outputdir": ".",
    }
    return ply.yacc.yacc(**yacc_options)
if __name__ == "__main__":
    # Running this module directly regenerates the parser tables.
    # Pass an instance, the same way BParser.__init__ calls make_parser:
    # the p_* rules are written as methods taking ``self``, so handing PLY
    # the bare class would leave them unbound (rejected on Python 3).
    make_parser(BParser())
import ply.lex
import ply.yacc
class BaseLexer(object):
    """PLY lexer wrapper for integer literals and parentheses.

    The instance forwards ``input()``, ``token()`` and iteration to the
    PLY lexer it builds, so it can be handed to a parser directly.
    """

    tokens = ["IMMEDIATE", "LPAREN", "RPAREN"]

    # Simple tokens are given as regular-expression strings.
    t_ignore = " "
    t_IMMEDIATE = r"[0-9]+"
    t_LPAREN = r"\("
    t_RPAREN = r"\)"

    def __init__(self):
        """Build the PLY lexer from the rules defined on this instance."""
        self.lexer = ply.lex.lex(module=self)
        # Offset of the first character of the current line; used by
        # t_error to turn an absolute position into a column number.
        self.lexer.linestart = 0

    def input(self, data):
        """Feed ``data`` to the underlying lexer."""
        self.lexer.input(data)

    def token(self):
        """Return the next token from the underlying lexer."""
        return self.lexer.token()

    def __iter__(self):
        """Iterate over the tokens of the underlying lexer."""
        return iter(self.lexer)

    def t_newline(self, t):
        r"\n+"
        # The string above is the token's regular expression.
        # One matched character per newline, so the match length is the
        # number of lines to advance.
        lexer = t.lexer
        lexer.lineno += len(t.value)
        # Remember where the new line begins for column reporting.
        lexer.linestart = lexer.lexpos

    def t_error(self, t):
        """Raise an Exception naming the illegal character and its location."""
        bad_char = t.value[0]
        lexer = t.lexer
        line = lexer.lineno
        column = lexer.lexpos - lexer.linestart + 1  # columns are 1-based
        raise Exception("Illegal character '%s' on line %d, column %d" %
                        (bad_char, line, column))
class BaseParser(object):
    """PLY parser that evaluates integer literals and parenthesised expressions.

    The strings at the top of the ``p_*`` methods are the grammar rules
    that PLY reads; they are not ordinary documentation.
    """

    tokens = BaseLexer.tokens

    def __init__(self):
        """Create the lexer and build the parser from this instance's rules."""
        self.lexer = BaseLexer()
        self.parser = make_parser(self)

    def parse(self, text):
        """Parse ``text`` with this parser's own lexer and return the result."""
        return self.parser.parse(text, self.lexer)

    def p_expression(self, p):
        "expression : expression_factor"
        # A lone factor is already a complete expression; pass its value up.
        p[0] = p[1]

    def p_expression_factor_immediate(self, p):
        "expression_factor : IMMEDIATE"
        # The token value is the matched digit string.
        p[0] = int(p[1])

    def p_expression_factor_parens(self, p):
        "expression_factor : LPAREN expression RPAREN"
        # Drop the parentheses and keep the inner expression's value.
        p[0] = p[2]

    def p_error(self, p):
        """Raise an Exception describing the offending token, or EOF if ``p`` is falsy."""
        if not p:
            raise Exception("Syntax error at EOF")
        # Skip the first symbol-stack entry and list the types of the rest.
        symbol_types = [entry.type for entry in self.parser.symstack[1:]]
        details = (p.value, p.type, p.lineno,
                   self.parser.state, " ".join(symbol_types), p)
        raise Exception("Syntax error at '%s', type %s, on line %d\n"
                        "Parser state: %s %s . %s" % details)
def make_parser(mod):
    """Build a PLY yacc parser from the rules found on ``mod``.

    ``expression`` is the start symbol. The generated tables are cached
    under the module name ``baseparsetab`` in the current directory.
    """
    yacc_options = {
        "module": mod,
        "start": "expression",
        "tabmodule": "baseparsetab",
        "outputdir": ".",
    }
    return ply.yacc.yacc(**yacc_options)
if __name__ == "__main__":
    # Running this module directly regenerates the parser tables.
    # Pass an instance, the same way BaseParser.__init__ calls make_parser:
    # the p_* rules are written as methods taking ``self``, so handing PLY
    # the bare class would leave them unbound (rejected on Python 3).
    make_parser(BaseParser())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment