Skip to content

Instantly share code, notes, and snippets.

@whym
Created April 21, 2012 13:03
Show Gist options
  • Save whym/2436998 to your computer and use it in GitHub Desktop.
Save whym/2436998 to your computer and use it in GitHub Desktop.
Full parsing with chart parser: Given a tokenized input sentence and a set of grammar rules, parse() generates all possible trees.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Full parsing with chart parser:
# Given a tokenized input sentence and a set of grammar rules,
# parse() generates all possible trees.
from collections import defaultdict
from Queue import Queue
class Edge:
def __init__(self, start, end, word, rule, active=True, dot=None, children=[]):
self.label = rule.lhs
self.start = start
self.end = end
self.word = word
self.rule = rule
self.children = children
if active:
self.dot = 0
else:
self.dot = len(rule.rhs)
if dot:
self.dot = dot
def dotted(self):
return self.rule.rhs[self.dot]
def active(self):
return self.dot < len(self.rule.rhs)
def __repr__(self):
return "[%2d:%2d, %s/%s %s@%d] %s" % (self.start, self.end, self.word, self.label, self.rule, self.dot, self.children)
def __hash__(self):
if self.active():
return hash((self.label, self.start, self.end, self.rule, self.dot))
else:
return hash((self.label, self.start, self.end))
class Rule:
def __init__(self, lhs, rhs):
self.lhs = lhs
self.rhs = tuple(rhs)
def __repr__(self): return "{%s => %s}" % (self.lhs, self.rhs)
def __hash__(self): return hash((self.lhs, self.rhs))
class Grammar:
def __init__(self, rules):
self.rules = [Rule(x[0], x[1]) for x in rules]
def __repr__(self): return relr(self.rules)
def __iter__(self): return iter(self.rules)
def __len__(self): return len(self.rules)
class EdgeSet:
def __init__(self, ls=[]):
self.edges = set(ls)
def add(self, e):
self.edges.add(e)
def remove(self, e):
self.edges.remove(e)
def lookup_incomplete_ending_at(self, end, label):
for e in self.edges:
if e.active() and e.end == end and e.rule.rhs[e.dot] == label:
yield e
def lookup_complete_starting_at(self, start, label):
for e in self.edges:
if not e.active() and e.start == start and e.lhs == label:
yield e
def __repr__(self): return repr(self.edges)
def __iter__(self): return iter(self.edges)
def __len__(self): return len(self.edges)
def parse(tokens, rules):
def bracket(ls, left='(', right=')'):
ls = [x for x in ls if x != '']
if len(ls) == 1:
return ls[0]
else:
return left + (' '.join(ls)) + right
def edge2tree(edge):
if edge.active():
return bracket([edge2tree(x) for x in edge.children], '', '')
elif len(edge.children) == 0:
return bracket([edge.label, edge.word])
else:
return bracket([edge.label] + [edge2tree(x) for x in edge.children])
grammar = Grammar(rules)
ia_queue = Queue()
chart = EdgeSet()
# scan (initialize)
for (i,(word,lab)) in enumerate(tokens):
e = Edge(i, i+1, word, Rule(lab, [word]), active=False)
ia_queue.put(e)
chart.add(e)
for rule in grammar:
chart.add(Edge(i, i, '', rule, dot=0))
while ia_queue.qsize() > 0:
edge = ia_queue.get()
added = EdgeSet()
# create new edges by combining neighboring edges in the chart
if not edge.active():
for f in chart.lookup_incomplete_ending_at(edge.start, edge.label):
added.add(Edge(f.start, edge.end, bracket([f.word, edge.word], '', ''), f.rule, dot=f.dot+1, children=[f, edge]))
else:
for f in chart.lookup_complete_starting_at(edge.start, edge.label):
added.add(Edge(edge.start, f.end, bracket([edge.word, f.word], '', ''), edge.rule, dot=edge.dot+1, children=[edge, f]))
# reduce: remove redundant complete edges with the same span and label
radded = {}
for e in added:
if not e.active():
radded[(e.start, e.end, e.label)] = e
else:
radded[(e.start, e.end, e.rule, e.dot)] = e
added = EdgeSet(radded.values())
for e in added:
if not e.active():
ia_queue.put(e)
for e in added:
chart.add(e)
for x in chart:
if x.start == 0 and x.end == len(tokens) and not x.active():
yield edge2tree(x)
if __name__ == '__main__':
NOUN = 'N'
VERB = 'V'
PREP = 'P'
NP = 'NP'
VP = 'VP'
PP = 'PP'
S = 'S'
for tree in parse([('John', NOUN),
('saw', VERB),
('Mary', NOUN),
('with', PREP),
('telescope', NOUN),
],
[(S, [NP, VP]),
(S, [NP, VP, PP]),
(NP, [NOUN]),
(VP, [VERB]),
(PP, [PREP, NP]),
(NP, [NP, PP]),
(VP, [VP, NP]),
(VP, [VP, NP]),
(VP, [VP, NP, PP]),
]):
print tree
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment