Skip to content

Instantly share code, notes, and snippets.

@xiaohan2012
Last active April 6, 2018 22:54
Show Gist options
  • Save xiaohan2012/b9e3ab0ac5d23362bf33 to your computer and use it in GitHub Desktop.
Save xiaohan2012/b9e3ab0ac5d23362bf33 to your computer and use it in GitHub Desktop.
CYK parsing algorithm in C++
#include "btree.h"
BTreeNode::BTreeNode(std::string _name, BTreeNode *_left, BTreeNode *_right):
name(_name), left(_left), right(_right){};
BTreeNode::BTreeNode(std::string _name, BTreeNode *_left):
name(_name), left(_left), right(NULL){};
BTreeNode::BTreeNode(std::string _name):
name(_name), left(NULL), right(NULL){};
std::string BTreeNode::string_repr(){
if(is_leaf()){
return "'" + name + "'";
}
else if(has_single_child()){
return "('" + name + "', '" + left->string_repr() +"'" + ")";
}
else{
std::string left_str = left->string_repr(), right_str = right->string_repr();
return "('" + name + "', '" + left_str + "', '" + right_str + "')";
}
};
#include <string>
class BTreeNode {
public:
std::string name;
BTreeNode * left, * right;
BTreeNode(std::string _name, BTreeNode *_left, BTreeNode *_right);
BTreeNode(std::string _name, BTreeNode *_left);
BTreeNode(std::string _name);
std::string string_repr();
bool has_single_child(){
return left != NULL && right == NULL;
}
bool is_leaf(){
return left == NULL && right == NULL;
}
};
all: btree types.h rule table cyk main
g++-4.7 -std=c++11 *.o -o main
main: main.cpp
g++-4.7 -c -std=c++11 main.cpp
cyk: cyk.cpp
g++-4.7 -c -std=c++11 cyk.cpp
btree: btree.cpp
g++-4.7 -c -std=c++11 btree.cpp
rule: rule.cpp
g++-4.7 -c -std=c++11 rule.cpp
table: table.cpp
g++-4.7 -c -std=c++11 table.cpp
clean:
rm *.o
rm main
#include "cyk.h"
/*
Given the backpointer, return the possible parse trees
*/
std::vector<BTreeNode*> reconstruct(std::vector<std::vector<BPTableEntry>> bp, int i, int j, std::string symbol){
/*
bp: the backpointer table
*/
if(bp.at(i).at(j)[symbol].size() == 1 && bp.at(i).at(j)[symbol][0]->is_single()){ //the leaf
BPSingleEntry *e = (BPSingleEntry *)bp.at(i).at(j)[symbol][0];
BTreeNode *t = new BTreeNode {symbol,
new BTreeNode(e->name, NULL, NULL),
NULL};
return std::vector<BTreeNode*> {t};
}
else{ //internal node
std::vector<BTreeNode*> trees = std::vector<BTreeNode*> {};
for(auto temp : bp.at(i).at(j)[symbol]){
BPTupleEntry *e = (BPTupleEntry *)temp;
for(auto ltree : reconstruct(bp, e->l1, e->l2, e->lname)){
for(auto rtree : reconstruct(bp, e->r1, e->r2, e->rname)){
trees.push_back(new BTreeNode(symbol, ltree, rtree));
}
}
}
return trees;
}
}
std::vector<BTreeNode*> cyk(const StringVector & words, RuleVector & rules){
// build indexed rules
IndexedRules indexed_rules;
for(auto i = 0; i < rules.size(); ++i){
IndexedRulesKey key = {rules[i]->right1, rules[i]->right2};
if(indexed_rules.find(key) == indexed_rules.end()){ // key not exist
indexed_rules[key] = StringVector {};
}
indexed_rules[key].push_back(rules[i]->left);
}
// Create the 2D dynamic programming table in which each cell contains a vector of produced symbol(string)
// The table dimension should be N x N, where N is the word count
std::vector<std::vector<StringVector>> table;
// The backpointer
std::vector<std::vector<BPTableEntry>> bp;
//init the containers
int N = words.size()+1;
for(int i = 0; i < N; ++i){
table.push_back(std::vector<StringVector> {});
bp.push_back(std::vector<BPTableEntry> {});
for(int j=0; j < N; ++j){
table.at(i).push_back(StringVector {});
bp.at(i).push_back(BPTableEntry {});
}
}
//start filling in the cell contents
for(int j=1; j < N; ++j){
std::string word = words.at(j-1);
IndexedRulesKey key = {word, ""};
StringVector matching_symbols = indexed_rules[key];
for(auto &A_symbol : matching_symbols){
table.at(j-1).at(j).push_back(A_symbol);
if(bp.at(j-1).at(j).find(A_symbol) == bp.at(j-1).at(j).end()){ //if key does not exist, create one
bp.at(j-1).at(j)[A_symbol] = std::vector<BPTableEntryItem*> {};
}
bp.at(j-1).at(j)[A_symbol].push_back(new BPSingleEntry(word));
}
for(int i=j-2; i>=0; --i){
for(int k = i+1; k < j; ++k){
if(table.at(i).at(k).size() > 0 && table.at(k).at(j).size() > 0){ //if table entry is not empty
StringVector B_symbols = table.at(i).at(k);
StringVector C_symbols = table.at(k).at(j);
for(auto B_symbol : B_symbols){
for(auto C_symbol : C_symbols){
IndexedRulesKey key = {B_symbol, C_symbol};
StringVector matching_symbols = indexed_rules[key];
for(auto A_symbol : matching_symbols) {
table.at(i).at(j).push_back(A_symbol);
if(bp.at(i).at(j).find(A_symbol) == bp.at(i).at(j).end()){ //if key ont exist, create one
bp.at(i).at(j)[A_symbol] = std::vector<BPTableEntryItem*> {};
}
bp.at(i).at(j)[A_symbol].push_back(new BPTupleEntry(i, k, B_symbol, k, j, C_symbol));
}
}
}
}
}
}
}
return reconstruct(bp, 0, N-1, "S");
}
h
#ifndef RULE
#define RULE
#include "rule.h"
#endif
#ifndef BTREE
#define BTREE
#include "btree.h"
#endif
#ifndef INDEX_KEY
#define INDEX_KEY
#include "index_key.h"
#endif
#ifndef TYPES
#define TYPES
#include "types.h"
#endif
#ifndef TABLE
#define TABLE
#include "table.h"
#endif
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
std::vector<BTreeNode*> cyk(const StringVector & words, RuleVector & rules);
#include <string>
#include <unordered_map>
#ifndef TYPES
#define TYPES
#include "types.h"
#endif
struct IndexedRulesKey{
std::string left, right;
bool operator==(const IndexedRulesKey &other) const {
return left == other.left && right == other.right;
}
std::string str(){
return "\"" + left + "\", "+ "\"" + right + "\", ";
}
};
namespace std {
template <>
struct hash<IndexedRulesKey>
{
std::size_t operator()(const IndexedRulesKey& k) const
{
using std::size_t;
using std::hash;
using std::string;
// Compute individual hash values for first,
// second and third and combine them using XOR
// and bit shifting:
return ((hash<string>()(k.left)
^ (hash<string>()(k.right) << 1)) >> 1);
}
};
}
typedef std::unordered_map<IndexedRulesKey, StringVector> IndexedRules;
#include "cyk.h"
#include <iostream>
int main(){
RuleVector rules {new Rule("S", "NP", "VP"), // **QUESTION**: how to make it on the stack?
new Rule("S", "X1", "VP"),
new Rule("X1", "AUX", "NP"),
new Rule("S", "book"),
new Rule("S", "include"),
new Rule("S", "prefer"),
new Rule("S", "VERB", "NP"),
new Rule("S", "X2", "PP"),
new Rule("S", "VERB", "PP"),
new Rule("S", "VP", "PP"),
new Rule("NP", "I"),
new Rule("NP", "she"),
new Rule("NP", "me"),
new Rule("NP", "TWA"),
new Rule("NP", "Houston"),
new Rule("NP", "DET", "NOMINAL"),
new Rule("NOMINAL", "book"),
new Rule("NOMINAL", "flight"),
new Rule("NOMINAL", "meal"),
new Rule("NOMINAL", "money"),
new Rule("NOMINAL", "NOMINAL", "NOUN"),
new Rule("NOMINAL", "NOMINAL", "PP"),
//lexicons
new Rule("VP", "book"),
new Rule("VP", "include"),
new Rule("VP", "prefer"),
new Rule("VP", "VERB", "NP"),
new Rule("VP", "X2", "PP"),
new Rule("X2", "VERB", "NP"),
new Rule("VP", "VERB", "PP"),
new Rule("VP", "VP", "PP"),
new Rule("PP", "PREPOSITION", "NP"),
new Rule("DET", "that" ),
new Rule("DET", "this"),
new Rule("DET", "a"),
new Rule("DET", "the"),
new Rule("NOUN", "book"),
new Rule("NOUN", "flight"),
new Rule("NOUN", "meal"),
new Rule("NOUN", "money"),
new Rule("VERB", "book"),
new Rule("VERB", "include"),
new Rule("VERB", "prefer"),
new Rule("PRONOUN", "I"),
new Rule("PRONOUN", "she"),
new Rule("PRONOUN", "me"),
new Rule("PROPER-NOUN", "TWA"),
new Rule("PROPER-NOUN", "Houston"),
new Rule("AUX", "does"),
new Rule("PREPOSITION", "from"),
new Rule("PREPOSITION", "to"),
new Rule("PREPOSITION", "on"),
new Rule("PREPOSITION", "near"),
new Rule("PREPOSITION", "through"),
new Rule("S", "Book")};
StringVector sents {"does", "the", "flight", "include", "a", "meal"};
std::vector<BTreeNode*> parses = cyk(sents, rules);
for(auto tree : parses){
std::cout << tree->string_repr() << std::endl;
}
//remember to destroy the rules
return 0;
}
#include "rule.h"
Rule::Rule(std::string _l, std::string _r1, std::string _r2):
left(_l), right1(_r1), right2(_r2){};
Rule::Rule(std::string _l, std::string _r1):
left(_l), right1(_r1), right2(""){};
#include <string>
#include <vector>
class Rule{ // 2st order production rule
public:
Rule(std::string _l, std::string _r1, std::string _r2);
Rule(std::string _l, std::string _r1);
bool is_first_order(){
return right2.length() == 0;
}
bool is_second_order(){
return !is_first_order();
}
std::string left, right1, right2; // the two right terms
};
typedef std::vector<Rule*> RuleVector; //vector of rule **pointer** for dynamic type casting
#include "table.h"
void print_dp_table(std::vector<std::vector<StringVector>> &table, int N){
int width = 20;
std::cout << "\t";
for(int j = 0; j < N; ++j){
std::cout << std::setw(width) << j; //the jth column
}
//-- line separation
std::cout << std::endl;
for(int i=0; i < (1+N)*width; ++i){
std::cout << "-";
}
std::cout << std::endl;
for(int i = 0; i < N; ++i){
std::cout << i << "\t"; //the ith row
for(int j = 0; j < N; ++j){
std::string cell_content = "";
for(auto &stuff : table.at(i).at(j)){
cell_content += (stuff + " ");
}
std::cout << std::setw(width) << cell_content;
}
//-- line separation
std::cout << std::endl;
for(int i=0; i < (1+N)*width; ++i){
std::cout << "-";
}
std::cout << std::endl;
}
}
#ifndef TYPES
#define TYPES
#include "types.h"
#endif
#include <iostream>
#include <iomanip>
#include <string>
#include <vector>
#include <unordered_map>
/*
Backpointer entry that can be either:
- string
- tuple of ((int, int, string), (int, int, string))
*/
class BPTableEntryItem{
public:
virtual bool is_single(){};
};
class BPSingleEntry: public BPTableEntryItem{
public:
BPSingleEntry(std::string _name): name(_name){};
std::string name;
bool is_single(){
return true;
}
};
class BPTupleEntry: public BPTableEntryItem{
public:
BPTupleEntry(int _l1, int _l2, std::string _lname, int _r1, int _r2, std::string _rname):
l1(_l1), l2(_l2), lname(_lname), r1(_r1), r2(_r2), rname(_rname){};
int l1, l2, r1, r2;
std::string lname, rname;
bool is_single(){
return false;
}
};
void print_dp_table(std::vector<std::vector<StringVector>> &table, int N);
typedef std::unordered_map<std::string, std::vector<BPTableEntryItem*>> BPTableEntry;
#include <string>
#include <vector>
typedef std::vector<std::string> StringVector;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment