Skip to content

Instantly share code, notes, and snippets.

@zahlenteufel
Created April 9, 2014 07:38
Show Gist options
  • Save zahlenteufel/10236760 to your computer and use it in GitHub Desktop.
Save zahlenteufel/10236760 to your computer and use it in GitHub Desktop.
Earley Algorithm for Non-Deterministic CFG Parsing in Python

#Earley Algorithm for Non-Deterministic CFG Parsing in Python

$ python earley.py "3+2*1+4"

Abstract Syntax Tree for 3+2*1+4

in case it can parse the input, it outputs the Abstract Syntax Tree:

\Tree [.G [.S [.S [.S [.M [.T [.3 ]]]] [.+ ] [.M [.M [.T [.2 ]]] [.* ] [.T [.1 ]]]] [.+ ] [.M [.T [.4 ]]]] [.$ ]]

otherwise it will output an error message through Standard Error.

The output is in QTree format. To use it you have to install the LaTeX package:

$ sudo apt-get install texlive-humanities

and then include it in some tex file:

\documentclass{article}
\pagestyle{empty}
\usepackage{qtree}
\begin{document}
\Tree [.G [.S [.S [.S [.M [.T [.3 ]]]] [.+ ] [.M [.M [.T [.2 ]]] [.* ] [.T [.1 ]]]] [.+ ] [.M [.T [.4 ]]]] [.\$ ]]
\end{document}

and for example you can compile it, render it and create an image:

$ pdflatex ast.tex && pdfcrop --margins 10 ast.pdf ast.pdf && convert -density 150 ast.pdf ast.png
# -*- coding: utf-8 -*-
#!/usr/bin/python
import sys
class Production:
def __init__(self, N, RHS):
self.N = N
self.RHS = RHS
def symbols(self):
return [s for s in self.RHS]
def __str__(self):
return self.N+" → "+self.RHS
class State:
def __init__(self, N, S1, S2, i, ast=None):
self.N = N
self.S1 = S1
self.S2 = S2
self.i = i
if not ast:
ast = AST(self.N, [])
self.ast = ast
def complete(self):
return len(self.S2)==0
def nextcat(self):
return self.S2[0]
def advance(self):
return State(self.N, self.S1+self.S2[0], self.S2[1:], self.i, AST(self.N, self.ast.children[:]))
def __str__(self):
return "[%s → %s•%s, %d]" % (self.N, self.S1, self.S2, self.i)
def __eq__(self, s2):
return self.N==s2.N and self.S1==s2.S1 and self.S2==s2.S2 and self.i==s2.i
class AST:
def __init__(self, label, children):
self.label = label
self.children = children
def __str__(self, first=False):
s = ""
if first:
s += "\Tree "
s += "[.%s %s]" % (self.label, " ".join(map(str, self.children)))
return s
class Grammar:
def __init__(self, N, T, P, S):
assert(S in N and all(s in T or s in N for p in P for s in p.symbols()) and all(p.N in N for p in P))
self.N = N
self.T = T
self.P = P
self.S = S
def __str__(self):
return "\n".join(map(str, self.P))+"\n"
def parse(self, string):
# Earley
string = string+"$"
i = 0
n = len(string)+1
predicted = [[] for i in xrange(n+1)]
predicted[0] = [State("G", "", "S$", 0)]
for i in xrange(n+1):
j = 0
while j < len(predicted[i]):
state = predicted[i][j]
if not state.complete():
c = state.nextcat()
if c in self.N:
self.predict(state, predicted[i], i)
else:
self.scan(state, string[i:], predicted[i+1], i)
else:
self.complete(state, predicted, i)
j += 1
expected = State("G","S$","",0)
for p in predicted[n-1]:
if p == expected:
return p.ast
return None
def predict(self, state, predicted, i):
n = state.nextcat()
for production in self.P:
if production.N == n:
st = State(n, "", production.RHS, i)
if st not in predicted:
predicted.append(State(n, "", production.RHS, i))
def scan(self, state, string, predicted, i):
if string[0] == state.nextcat():
s2 = state.advance()
if s2 not in predicted:
s2.ast.children.append(AST(string[0],[]))
predicted.append(s2)
def complete(self, completedState, predicted, k):
A = completedState.N
j = completedState.i
# completedState = A -> y•, j
for state in predicted[j]:
if not state.complete() and state.nextcat() == A:
# state = B -> x•Az, i
s2 = state.advance()
# s2 = B -> xA•z, i
if s2 not in predicted[k]:
s2.ast.children.append(completedState.ast)
predicted[k].append(s2)
def main():
# S -> S + M | M
# M = M * T | T
# T = 1 | 2 | 3 | 4
S0 = Production("G", "S")
S1 = Production("S", "S+M")
S2 = Production("S", "M")
M1 = Production("M", "M*T")
M2 = Production("M", "T")
T1 = Production("T", "1")
T2 = Production("T", "2")
T3 = Production("T", "3")
T4 = Production("T", "4")
G = Grammar("GSMT", "1234*+$", [S0,S1,S2,M1,M2,T1,T2,T3,T4], "G")
# print G
s = "" if len(sys.argv) == 1 else sys.argv[1]
ast = G.parse(s)
if ast:
print ast.__str__(True).replace("$", "\$")
else:
print >> sys.stderr, "parse error: the string doesn't belong to L(G)"
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment