Last active
April 6, 2018 22:54
-
-
Save xiaohan2012/b9e3ab0ac5d23362bf33 to your computer and use it in GitHub Desktop.
CYK parsing algorithm in C++
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "btree.h" | |
| BTreeNode::BTreeNode(std::string _name, BTreeNode *_left, BTreeNode *_right): | |
| name(_name), left(_left), right(_right){}; | |
| BTreeNode::BTreeNode(std::string _name, BTreeNode *_left): | |
| name(_name), left(_left), right(NULL){}; | |
| BTreeNode::BTreeNode(std::string _name): | |
| name(_name), left(NULL), right(NULL){}; | |
| std::string BTreeNode::string_repr(){ | |
| if(is_leaf()){ | |
| return "'" + name + "'"; | |
| } | |
| else if(has_single_child()){ | |
| return "('" + name + "', '" + left->string_repr() +"'" + ")"; | |
| } | |
| else{ | |
| std::string left_str = left->string_repr(), right_str = right->string_repr(); | |
| return "('" + name + "', '" + left_str + "', '" + right_str + "')"; | |
| } | |
| }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <string> | |
| class BTreeNode { | |
| public: | |
| std::string name; | |
| BTreeNode * left, * right; | |
| BTreeNode(std::string _name, BTreeNode *_left, BTreeNode *_right); | |
| BTreeNode(std::string _name, BTreeNode *_left); | |
| BTreeNode(std::string _name); | |
| std::string string_repr(); | |
| bool has_single_child(){ | |
| return left != NULL && right == NULL; | |
| } | |
| bool is_leaf(){ | |
| return left == NULL && right == NULL; | |
| } | |
| }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| all: btree types.h rule table cyk main | |
| g++-4.7 -std=c++11 *.o -o main | |
| main: main.cpp | |
| g++-4.7 -c -std=c++11 main.cpp | |
| cyk: cyk.cpp | |
| g++-4.7 -c -std=c++11 cyk.cpp | |
| btree: btree.cpp | |
| g++-4.7 -c -std=c++11 btree.cpp | |
| rule: rule.cpp | |
| g++-4.7 -c -std=c++11 rule.cpp | |
| table: table.cpp | |
| g++-4.7 -c -std=c++11 table.cpp | |
| clean: | |
| rm *.o | |
| rm main |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "cyk.h" | |
| /* | |
| Given the backpointer, return the possible parse trees | |
| */ | |
| std::vector<BTreeNode*> reconstruct(std::vector<std::vector<BPTableEntry>> bp, int i, int j, std::string symbol){ | |
| /* | |
| bp: the backpointer table | |
| */ | |
| if(bp.at(i).at(j)[symbol].size() == 1 && bp.at(i).at(j)[symbol][0]->is_single()){ //the leaf | |
| BPSingleEntry *e = (BPSingleEntry *)bp.at(i).at(j)[symbol][0]; | |
| BTreeNode *t = new BTreeNode {symbol, | |
| new BTreeNode(e->name, NULL, NULL), | |
| NULL}; | |
| return std::vector<BTreeNode*> {t}; | |
| } | |
| else{ //internal node | |
| std::vector<BTreeNode*> trees = std::vector<BTreeNode*> {}; | |
| for(auto temp : bp.at(i).at(j)[symbol]){ | |
| BPTupleEntry *e = (BPTupleEntry *)temp; | |
| for(auto ltree : reconstruct(bp, e->l1, e->l2, e->lname)){ | |
| for(auto rtree : reconstruct(bp, e->r1, e->r2, e->rname)){ | |
| trees.push_back(new BTreeNode(symbol, ltree, rtree)); | |
| } | |
| } | |
| } | |
| return trees; | |
| } | |
| } | |
| std::vector<BTreeNode*> cyk(const StringVector & words, RuleVector & rules){ | |
| // build indexed rules | |
| IndexedRules indexed_rules; | |
| for(auto i = 0; i < rules.size(); ++i){ | |
| IndexedRulesKey key = {rules[i]->right1, rules[i]->right2}; | |
| if(indexed_rules.find(key) == indexed_rules.end()){ // key not exist | |
| indexed_rules[key] = StringVector {}; | |
| } | |
| indexed_rules[key].push_back(rules[i]->left); | |
| } | |
| // Create the 2D dynamic programming table in which each cell contains a vector of produced symbol(string) | |
| // The table dimension should be N x N, where N is the word count | |
| std::vector<std::vector<StringVector>> table; | |
| // The backpointer | |
| std::vector<std::vector<BPTableEntry>> bp; | |
| //init the containers | |
| int N = words.size()+1; | |
| for(int i = 0; i < N; ++i){ | |
| table.push_back(std::vector<StringVector> {}); | |
| bp.push_back(std::vector<BPTableEntry> {}); | |
| for(int j=0; j < N; ++j){ | |
| table.at(i).push_back(StringVector {}); | |
| bp.at(i).push_back(BPTableEntry {}); | |
| } | |
| } | |
| //start filling in the cell contents | |
| for(int j=1; j < N; ++j){ | |
| std::string word = words.at(j-1); | |
| IndexedRulesKey key = {word, ""}; | |
| StringVector matching_symbols = indexed_rules[key]; | |
| for(auto &A_symbol : matching_symbols){ | |
| table.at(j-1).at(j).push_back(A_symbol); | |
| if(bp.at(j-1).at(j).find(A_symbol) == bp.at(j-1).at(j).end()){ //if key does not exist, create one | |
| bp.at(j-1).at(j)[A_symbol] = std::vector<BPTableEntryItem*> {}; | |
| } | |
| bp.at(j-1).at(j)[A_symbol].push_back(new BPSingleEntry(word)); | |
| } | |
| for(int i=j-2; i>=0; --i){ | |
| for(int k = i+1; k < j; ++k){ | |
| if(table.at(i).at(k).size() > 0 && table.at(k).at(j).size() > 0){ //if table entry is not empty | |
| StringVector B_symbols = table.at(i).at(k); | |
| StringVector C_symbols = table.at(k).at(j); | |
| for(auto B_symbol : B_symbols){ | |
| for(auto C_symbol : C_symbols){ | |
| IndexedRulesKey key = {B_symbol, C_symbol}; | |
| StringVector matching_symbols = indexed_rules[key]; | |
| for(auto A_symbol : matching_symbols) { | |
| table.at(i).at(j).push_back(A_symbol); | |
| if(bp.at(i).at(j).find(A_symbol) == bp.at(i).at(j).end()){ //if key ont exist, create one | |
| bp.at(i).at(j)[A_symbol] = std::vector<BPTableEntryItem*> {}; | |
| } | |
| bp.at(i).at(j)[A_symbol].push_back(new BPTupleEntry(i, k, B_symbol, k, j, C_symbol)); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return reconstruct(bp, 0, N-1, "S"); | |
| } | |
| h |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #ifndef RULE | |
| #define RULE | |
| #include "rule.h" | |
| #endif | |
| #ifndef BTREE | |
| #define BTREE | |
| #include "btree.h" | |
| #endif | |
| #ifndef INDEX_KEY | |
| #define INDEX_KEY | |
| #include "index_key.h" | |
| #endif | |
| #ifndef TYPES | |
| #define TYPES | |
| #include "types.h" | |
| #endif | |
| #ifndef TABLE | |
| #define TABLE | |
| #include "table.h" | |
| #endif | |
| #include <iostream> | |
| #include <string> | |
| #include <unordered_map> | |
| #include <vector> | |
| std::vector<BTreeNode*> cyk(const StringVector & words, RuleVector & rules); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <string> | |
| #include <unordered_map> | |
| #ifndef TYPES | |
| #define TYPES | |
| #include "types.h" | |
| #endif | |
| struct IndexedRulesKey{ | |
| std::string left, right; | |
| bool operator==(const IndexedRulesKey &other) const { | |
| return left == other.left && right == other.right; | |
| } | |
| std::string str(){ | |
| return "\"" + left + "\", "+ "\"" + right + "\", "; | |
| } | |
| }; | |
| namespace std { | |
| template <> | |
| struct hash<IndexedRulesKey> | |
| { | |
| std::size_t operator()(const IndexedRulesKey& k) const | |
| { | |
| using std::size_t; | |
| using std::hash; | |
| using std::string; | |
| // Compute individual hash values for first, | |
| // second and third and combine them using XOR | |
| // and bit shifting: | |
| return ((hash<string>()(k.left) | |
| ^ (hash<string>()(k.right) << 1)) >> 1); | |
| } | |
| }; | |
| } | |
| typedef std::unordered_map<IndexedRulesKey, StringVector> IndexedRules; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "cyk.h" | |
| #include <iostream> | |
| int main(){ | |
| RuleVector rules {new Rule("S", "NP", "VP"), // **QUESTION**: how to make it on the stack? | |
| new Rule("S", "X1", "VP"), | |
| new Rule("X1", "AUX", "NP"), | |
| new Rule("S", "book"), | |
| new Rule("S", "include"), | |
| new Rule("S", "prefer"), | |
| new Rule("S", "VERB", "NP"), | |
| new Rule("S", "X2", "PP"), | |
| new Rule("S", "VERB", "PP"), | |
| new Rule("S", "VP", "PP"), | |
| new Rule("NP", "I"), | |
| new Rule("NP", "she"), | |
| new Rule("NP", "me"), | |
| new Rule("NP", "TWA"), | |
| new Rule("NP", "Houston"), | |
| new Rule("NP", "DET", "NOMINAL"), | |
| new Rule("NOMINAL", "book"), | |
| new Rule("NOMINAL", "flight"), | |
| new Rule("NOMINAL", "meal"), | |
| new Rule("NOMINAL", "money"), | |
| new Rule("NOMINAL", "NOMINAL", "NOUN"), | |
| new Rule("NOMINAL", "NOMINAL", "PP"), | |
| //lexicons | |
| new Rule("VP", "book"), | |
| new Rule("VP", "include"), | |
| new Rule("VP", "prefer"), | |
| new Rule("VP", "VERB", "NP"), | |
| new Rule("VP", "X2", "PP"), | |
| new Rule("X2", "VERB", "NP"), | |
| new Rule("VP", "VERB", "PP"), | |
| new Rule("VP", "VP", "PP"), | |
| new Rule("PP", "PREPOSITION", "NP"), | |
| new Rule("DET", "that" ), | |
| new Rule("DET", "this"), | |
| new Rule("DET", "a"), | |
| new Rule("DET", "the"), | |
| new Rule("NOUN", "book"), | |
| new Rule("NOUN", "flight"), | |
| new Rule("NOUN", "meal"), | |
| new Rule("NOUN", "money"), | |
| new Rule("VERB", "book"), | |
| new Rule("VERB", "include"), | |
| new Rule("VERB", "prefer"), | |
| new Rule("PRONOUN", "I"), | |
| new Rule("PRONOUN", "she"), | |
| new Rule("PRONOUN", "me"), | |
| new Rule("PROPER-NOUN", "TWA"), | |
| new Rule("PROPER-NOUN", "Houston"), | |
| new Rule("AUX", "does"), | |
| new Rule("PREPOSITION", "from"), | |
| new Rule("PREPOSITION", "to"), | |
| new Rule("PREPOSITION", "on"), | |
| new Rule("PREPOSITION", "near"), | |
| new Rule("PREPOSITION", "through"), | |
| new Rule("S", "Book")}; | |
| StringVector sents {"does", "the", "flight", "include", "a", "meal"}; | |
| std::vector<BTreeNode*> parses = cyk(sents, rules); | |
| for(auto tree : parses){ | |
| std::cout << tree->string_repr() << std::endl; | |
| } | |
| //remember to destroy the rules | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "rule.h" | |
| Rule::Rule(std::string _l, std::string _r1, std::string _r2): | |
| left(_l), right1(_r1), right2(_r2){}; | |
| Rule::Rule(std::string _l, std::string _r1): | |
| left(_l), right1(_r1), right2(""){}; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <string> | |
| #include <vector> | |
| class Rule{ // 2st order production rule | |
| public: | |
| Rule(std::string _l, std::string _r1, std::string _r2); | |
| Rule(std::string _l, std::string _r1); | |
| bool is_first_order(){ | |
| return right2.length() == 0; | |
| } | |
| bool is_second_order(){ | |
| return !is_first_order(); | |
| } | |
| std::string left, right1, right2; // the two right terms | |
| }; | |
| typedef std::vector<Rule*> RuleVector; //vector of rule **pointer** for dynamic type casting |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "table.h" | |
| void print_dp_table(std::vector<std::vector<StringVector>> &table, int N){ | |
| int width = 20; | |
| std::cout << "\t"; | |
| for(int j = 0; j < N; ++j){ | |
| std::cout << std::setw(width) << j; //the jth column | |
| } | |
| //-- line separation | |
| std::cout << std::endl; | |
| for(int i=0; i < (1+N)*width; ++i){ | |
| std::cout << "-"; | |
| } | |
| std::cout << std::endl; | |
| for(int i = 0; i < N; ++i){ | |
| std::cout << i << "\t"; //the ith row | |
| for(int j = 0; j < N; ++j){ | |
| std::string cell_content = ""; | |
| for(auto &stuff : table.at(i).at(j)){ | |
| cell_content += (stuff + " "); | |
| } | |
| std::cout << std::setw(width) << cell_content; | |
| } | |
| //-- line separation | |
| std::cout << std::endl; | |
| for(int i=0; i < (1+N)*width; ++i){ | |
| std::cout << "-"; | |
| } | |
| std::cout << std::endl; | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #ifndef TYPES | |
| #define TYPES | |
| #include "types.h" | |
| #endif | |
| #include <iostream> | |
| #include <iomanip> | |
| #include <string> | |
| #include <vector> | |
| #include <unordered_map> | |
| /* | |
| Backpointer entry that can be either: | |
| - string | |
| - tuple of ((int, int, string), (int, int, string)) | |
| */ | |
| class BPTableEntryItem{ | |
| public: | |
| virtual bool is_single(){}; | |
| }; | |
| class BPSingleEntry: public BPTableEntryItem{ | |
| public: | |
| BPSingleEntry(std::string _name): name(_name){}; | |
| std::string name; | |
| bool is_single(){ | |
| return true; | |
| } | |
| }; | |
| class BPTupleEntry: public BPTableEntryItem{ | |
| public: | |
| BPTupleEntry(int _l1, int _l2, std::string _lname, int _r1, int _r2, std::string _rname): | |
| l1(_l1), l2(_l2), lname(_lname), r1(_r1), r2(_r2), rname(_rname){}; | |
| int l1, l2, r1, r2; | |
| std::string lname, rname; | |
| bool is_single(){ | |
| return false; | |
| } | |
| }; | |
| void print_dp_table(std::vector<std::vector<StringVector>> &table, int N); | |
| typedef std::unordered_map<std::string, std::vector<BPTableEntryItem*>> BPTableEntry; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <string> | |
| #include <vector> | |
| typedef std::vector<std::string> StringVector; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment