Created
April 21, 2012 13:03
-
-
Save whym/2436998 to your computer and use it in GitHub Desktop.
Full parsing with chart parser: Given a tokenized input sentence and a set of grammar rules, parse() generates all possible trees.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Full parsing with chart parser: | |
# Given a tokenized input sentence and a set of grammar rules, | |
# parse() generates all possible trees. | |
from collections import defaultdict | |
from Queue import Queue | |
class Edge: | |
def __init__(self, start, end, word, rule, active=True, dot=None, children=[]): | |
self.label = rule.lhs | |
self.start = start | |
self.end = end | |
self.word = word | |
self.rule = rule | |
self.children = children | |
if active: | |
self.dot = 0 | |
else: | |
self.dot = len(rule.rhs) | |
if dot: | |
self.dot = dot | |
def dotted(self): | |
return self.rule.rhs[self.dot] | |
def active(self): | |
return self.dot < len(self.rule.rhs) | |
def __repr__(self): | |
return "[%2d:%2d, %s/%s %s@%d] %s" % (self.start, self.end, self.word, self.label, self.rule, self.dot, self.children) | |
def __hash__(self): | |
if self.active(): | |
return hash((self.label, self.start, self.end, self.rule, self.dot)) | |
else: | |
return hash((self.label, self.start, self.end)) | |
class Rule: | |
def __init__(self, lhs, rhs): | |
self.lhs = lhs | |
self.rhs = tuple(rhs) | |
def __repr__(self): return "{%s => %s}" % (self.lhs, self.rhs) | |
def __hash__(self): return hash((self.lhs, self.rhs)) | |
class Grammar: | |
def __init__(self, rules): | |
self.rules = [Rule(x[0], x[1]) for x in rules] | |
def __repr__(self): return relr(self.rules) | |
def __iter__(self): return iter(self.rules) | |
def __len__(self): return len(self.rules) | |
class EdgeSet: | |
def __init__(self, ls=[]): | |
self.edges = set(ls) | |
def add(self, e): | |
self.edges.add(e) | |
def remove(self, e): | |
self.edges.remove(e) | |
def lookup_incomplete_ending_at(self, end, label): | |
for e in self.edges: | |
if e.active() and e.end == end and e.rule.rhs[e.dot] == label: | |
yield e | |
def lookup_complete_starting_at(self, start, label): | |
for e in self.edges: | |
if not e.active() and e.start == start and e.lhs == label: | |
yield e | |
def __repr__(self): return repr(self.edges) | |
def __iter__(self): return iter(self.edges) | |
def __len__(self): return len(self.edges) | |
def parse(tokens, rules): | |
def bracket(ls, left='(', right=')'): | |
ls = [x for x in ls if x != ''] | |
if len(ls) == 1: | |
return ls[0] | |
else: | |
return left + (' '.join(ls)) + right | |
def edge2tree(edge): | |
if edge.active(): | |
return bracket([edge2tree(x) for x in edge.children], '', '') | |
elif len(edge.children) == 0: | |
return bracket([edge.label, edge.word]) | |
else: | |
return bracket([edge.label] + [edge2tree(x) for x in edge.children]) | |
grammar = Grammar(rules) | |
ia_queue = Queue() | |
chart = EdgeSet() | |
# scan (initialize) | |
for (i,(word,lab)) in enumerate(tokens): | |
e = Edge(i, i+1, word, Rule(lab, [word]), active=False) | |
ia_queue.put(e) | |
chart.add(e) | |
for rule in grammar: | |
chart.add(Edge(i, i, '', rule, dot=0)) | |
while ia_queue.qsize() > 0: | |
edge = ia_queue.get() | |
added = EdgeSet() | |
# create new edges by combining neighboring edges in the chart | |
if not edge.active(): | |
for f in chart.lookup_incomplete_ending_at(edge.start, edge.label): | |
added.add(Edge(f.start, edge.end, bracket([f.word, edge.word], '', ''), f.rule, dot=f.dot+1, children=[f, edge])) | |
else: | |
for f in chart.lookup_complete_starting_at(edge.start, edge.label): | |
added.add(Edge(edge.start, f.end, bracket([edge.word, f.word], '', ''), edge.rule, dot=edge.dot+1, children=[edge, f])) | |
# reduce: remove redundant complete edges with the same span and label | |
radded = {} | |
for e in added: | |
if not e.active(): | |
radded[(e.start, e.end, e.label)] = e | |
else: | |
radded[(e.start, e.end, e.rule, e.dot)] = e | |
added = EdgeSet(radded.values()) | |
for e in added: | |
if not e.active(): | |
ia_queue.put(e) | |
for e in added: | |
chart.add(e) | |
for x in chart: | |
if x.start == 0 and x.end == len(tokens) and not x.active(): | |
yield edge2tree(x) | |
if __name__ == '__main__': | |
NOUN = 'N' | |
VERB = 'V' | |
PREP = 'P' | |
NP = 'NP' | |
VP = 'VP' | |
PP = 'PP' | |
S = 'S' | |
for tree in parse([('John', NOUN), | |
('saw', VERB), | |
('Mary', NOUN), | |
('with', PREP), | |
('telescope', NOUN), | |
], | |
[(S, [NP, VP]), | |
(S, [NP, VP, PP]), | |
(NP, [NOUN]), | |
(VP, [VERB]), | |
(PP, [PREP, NP]), | |
(NP, [NP, PP]), | |
(VP, [VP, NP]), | |
(VP, [VP, NP]), | |
(VP, [VP, NP, PP]), | |
]): | |
print tree |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment