Skip to content

Instantly share code, notes, and snippets.

@xiaohan2012
Created October 25, 2014 11:18
Show Gist options
  • Save xiaohan2012/81982c8ac2e2c5faa719 to your computer and use it in GitHub Desktop.
Save xiaohan2012/81982c8ac2e2c5faa719 to your computer and use it in GitHub Desktop.
CYK parsing algorithm
"""
The CYK parsing algorithm
"""
from collections import defaultdict
from itertools import product
import texttable #needs to be installed, can be accessed at http://foutaise.org/code/texttable/
def _print_tabel(table, nrow, ncol):
"""Utility function to print to dynamic programming table"""
t = texttable.Texttable()
rows = [[" "] + map(str, xrange(1, ncol+1))]
for i in xrange(-1, nrow-1):
row_content = ['\n'.join(table[i][j]) for j in xrange(ncol)]
rows.append([str(i+1)] + row_content)
t.add_rows(rows)
print t.draw() + "\n"
def CYK_parse(words, rules):
"""
Params:
------
words: list of words
rules: list of production rules indexed by the right hand side term
Return:
------
Possible parses of the input words
"""
table = defaultdict(lambda : defaultdict(list)) #dynamic programming table
bp = defaultdict(lambda : defaultdict(lambda : defaultdict(list))) #back-pointer table for parse tree reconstruction
for j in xrange(len(words)):
for A in rules[tuple([words[j]])]: #all derivations, A that A -> words[j]
table[j-1][j].append(A)
bp[j-1][j][A].append(words[j])
for i in xrange(j-2, -2, -1):
for k in xrange(i+1, j):
for B, C in product(table[i][k], table[k][j]):
for A in rules[(B, C)]:
table[i][j].append(A)
bp[i][j][A].append(((i,k,B), (k,j, C)))
_print_tabel(table, len(words), len(words))
def recons(bp, i, j, sym):
"""
Reconstruct the parse tree
"""
trees = []
if len(bp[i][j][sym]) == 1 and isinstance(bp[i][j][sym][0], str): # terminals
return [(sym, bp[i][j][sym][0])]
else:
for p1, p2 in bp[i][j][sym]:
i,k,B = p1
k,j,C = p2
for left_tree, right_tree in product(recons(bp, i, k, B),
recons(bp, k, j, C)):
trees.append((sym, left_tree, right_tree))
return trees
return recons(bp, -1, len(words)-1, "S")
if __name__ == "__main__":
raw_rules = [
("S", ("NP", "VP")),
("S", ("X1", "VP")),
("X1", ("AUX", "NP")),
("S", ("book",)),
("S", ("include",)),
("S", ("prefer",)),
("S", ("VERB", "NP")),
("S", ("X2", "PP")),
("S", ("VERB", "PP")),
("S", ("VP", "PP")),
("NP", ("I",)),
("NP", ("she",)),
("NP", ("me",)),
("NP", ("TWA",)),
("NP", ("Houston",)),
("NP", ("DET", "NOMINAL")),
("NOMINAL", ("book",)),
("NOMINAL", ("flight",)),
("NOMINAL", ("meal",)),
("NOMINAL", ("money",)),
("NOMINAL", ("NOMINAL", "NOUN")),
("NOMINAL", ("NOMINAL", "PP")),
# lexicons
("VP", ("book",)),
("VP", ("include",)),
("VP", ("prefer",)),
("VP", ("VERB", "NP")),
("VP", ("X2", "PP")),
("X2", ("VERB", "NP")),
("VP", ("VERB", "PP")),
("VP", ("VP", "PP")),
("PP", ("PREPOSITION", "NP")),
("DET", ("that" )),
("DET", ("this", )),
("DET", ("a", )),
("DET", ("the", )),
("NOUN", ("book", )),
("NOUN", ("flight", )),
("NOUN", ("meal", )),
("NOUN", ("money", )),
("VERB", ("book", )),
("VERB", ("include", )),
("VERB", ("prefer", )),
("PRONOUN", ("I", )),
("PRONOUN", ("she", )),
("PRONOUN", ("me", )),
("PROPER-NOUN", ("TWA", )),
("PROPER-NOUN", ("Houston", )),
("AUX", ("does", )),
("PREPOSITION", ("from", )),
("PREPOSITION", ("to", )),
("PREPOSITION", ("on", )),
("PREPOSITION", ("near", )),
("PREPOSITION", ("through", )),
]
def index_rules(rules):
indexing = defaultdict(list)
for A,B in rules:
indexing[B].append(A)
return indexing
for tree in CYK_parse(["does", "the", "flight", "include", "a", "meal"], index_rules(raw_rules)):
print tree
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment