Created
October 25, 2014 11:18
-
-
Save xiaohan2012/81982c8ac2e2c5faa719 to your computer and use it in GitHub Desktop.
CYK parsing algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The CYK parsing algorithm | |
""" | |
from collections import defaultdict | |
from itertools import product | |
import texttable #needs to be installed, can be accessed at http://foutaise.org/code/texttable/ | |
def _print_tabel(table, nrow, ncol): | |
"""Utility function to print to dynamic programming table""" | |
t = texttable.Texttable() | |
rows = [[" "] + map(str, xrange(1, ncol+1))] | |
for i in xrange(-1, nrow-1): | |
row_content = ['\n'.join(table[i][j]) for j in xrange(ncol)] | |
rows.append([str(i+1)] + row_content) | |
t.add_rows(rows) | |
print t.draw() + "\n" | |
def CYK_parse(words, rules): | |
""" | |
Params: | |
------ | |
words: list of words | |
rules: list of production rules indexed by the right hand side term | |
Return: | |
------ | |
Possible parses of the input words | |
""" | |
table = defaultdict(lambda : defaultdict(list)) #dynamic programming table | |
bp = defaultdict(lambda : defaultdict(lambda : defaultdict(list))) #back-pointer table for parse tree reconstruction | |
for j in xrange(len(words)): | |
for A in rules[tuple([words[j]])]: #all derivations, A that A -> words[j] | |
table[j-1][j].append(A) | |
bp[j-1][j][A].append(words[j]) | |
for i in xrange(j-2, -2, -1): | |
for k in xrange(i+1, j): | |
for B, C in product(table[i][k], table[k][j]): | |
for A in rules[(B, C)]: | |
table[i][j].append(A) | |
bp[i][j][A].append(((i,k,B), (k,j, C))) | |
_print_tabel(table, len(words), len(words)) | |
def recons(bp, i, j, sym): | |
""" | |
Reconstruct the parse tree | |
""" | |
trees = [] | |
if len(bp[i][j][sym]) == 1 and isinstance(bp[i][j][sym][0], str): # terminals | |
return [(sym, bp[i][j][sym][0])] | |
else: | |
for p1, p2 in bp[i][j][sym]: | |
i,k,B = p1 | |
k,j,C = p2 | |
for left_tree, right_tree in product(recons(bp, i, k, B), | |
recons(bp, k, j, C)): | |
trees.append((sym, left_tree, right_tree)) | |
return trees | |
return recons(bp, -1, len(words)-1, "S") | |
if __name__ == "__main__": | |
raw_rules = [ | |
("S", ("NP", "VP")), | |
("S", ("X1", "VP")), | |
("X1", ("AUX", "NP")), | |
("S", ("book",)), | |
("S", ("include",)), | |
("S", ("prefer",)), | |
("S", ("VERB", "NP")), | |
("S", ("X2", "PP")), | |
("S", ("VERB", "PP")), | |
("S", ("VP", "PP")), | |
("NP", ("I",)), | |
("NP", ("she",)), | |
("NP", ("me",)), | |
("NP", ("TWA",)), | |
("NP", ("Houston",)), | |
("NP", ("DET", "NOMINAL")), | |
("NOMINAL", ("book",)), | |
("NOMINAL", ("flight",)), | |
("NOMINAL", ("meal",)), | |
("NOMINAL", ("money",)), | |
("NOMINAL", ("NOMINAL", "NOUN")), | |
("NOMINAL", ("NOMINAL", "PP")), | |
# lexicons | |
("VP", ("book",)), | |
("VP", ("include",)), | |
("VP", ("prefer",)), | |
("VP", ("VERB", "NP")), | |
("VP", ("X2", "PP")), | |
("X2", ("VERB", "NP")), | |
("VP", ("VERB", "PP")), | |
("VP", ("VP", "PP")), | |
("PP", ("PREPOSITION", "NP")), | |
("DET", ("that" )), | |
("DET", ("this", )), | |
("DET", ("a", )), | |
("DET", ("the", )), | |
("NOUN", ("book", )), | |
("NOUN", ("flight", )), | |
("NOUN", ("meal", )), | |
("NOUN", ("money", )), | |
("VERB", ("book", )), | |
("VERB", ("include", )), | |
("VERB", ("prefer", )), | |
("PRONOUN", ("I", )), | |
("PRONOUN", ("she", )), | |
("PRONOUN", ("me", )), | |
("PROPER-NOUN", ("TWA", )), | |
("PROPER-NOUN", ("Houston", )), | |
("AUX", ("does", )), | |
("PREPOSITION", ("from", )), | |
("PREPOSITION", ("to", )), | |
("PREPOSITION", ("on", )), | |
("PREPOSITION", ("near", )), | |
("PREPOSITION", ("through", )), | |
] | |
def index_rules(rules): | |
indexing = defaultdict(list) | |
for A,B in rules: | |
indexing[B].append(A) | |
return indexing | |
for tree in CYK_parse(["does", "the", "flight", "include", "a", "meal"], index_rules(raw_rules)): | |
print tree |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment