Last active
December 27, 2015 02:09
-
-
Save spaghetti-source/7250727 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // | |
| // (Simplified) Sequence Memoizer | |
| // for (unbounded depth) Kneser-Ney smoothing | |
| // | |
| // Reference: | |
| // | |
| // - J. Gasthaus, F. Wood, and Y. W. Teh (2010): | |
| // Lossless compression based on the Sequence Memoizer. | |
| // In Proceedings of Data Compression Conference, | |
| // pp. 337-345. | |
| // | |
| // - F. Wood, J. Gasthaus, G. Archambeau, L. James, and Y. W. Teh (2011): | |
| // The sequence memoizer. | |
| // Communications of the ACM, | |
| // vol. 54, no. 2, pp. 91-98. | |
| // | |
| // | |
| // Sample Input: | |
| // ap.txt in http://www.cs.princeton.edu/~blei/lda-c/ap.tgz | |
| // | |
| // | |
| #include <iostream> | |
| #include <vector> | |
| #include <cstdio> | |
| #include <cstdlib> | |
| #include <map> | |
| #include <sstream> | |
| #include <cmath> | |
| #include <cstring> | |
| #include <fstream> | |
| #include <functional> | |
| #include <algorithm> | |
| #include <unordered_map> | |
| #include <iterator> | |
| #include <ctime> | |
| using namespace std; | |
| template <class T> | |
| struct dictionary { | |
| unordered_map<T, size_t> dict; | |
| vector<T> idict; | |
| bool has(T x) const { | |
| return dict.count(x); | |
| } | |
| size_t id(T x) { | |
| if (!has(x)) { | |
| dict[x] = idict.size(); | |
| idict.push_back(x); | |
| } | |
| return dict[x]; | |
| } | |
| T value(size_t id) { | |
| return idict[id]; | |
| } | |
| size_t size() const { | |
| return idict.size(); | |
| } | |
| }; | |
| double tick() { | |
| static clock_t oldtick; | |
| clock_t newtick = clock(); | |
| double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC; | |
| oldtick = newtick; | |
| return diff; | |
| } | |
| // --- | |
| struct sequence_memoizer { | |
| vector<size_t> x; | |
| struct node { | |
| size_t count; | |
| unordered_map<size_t, tuple<size_t, size_t, node*>> link; | |
| } *root; | |
| size_t total_nodes; | |
| node *make_node() { | |
| ++total_nodes; | |
| node *t = new node(); | |
| t->count = 0; | |
| return t; | |
| } | |
| sequence_memoizer() { | |
| total_nodes = 0; | |
| root = make_node(); | |
| } | |
| void insert(size_t i, size_t j) { | |
| for (node *p = root; ;) { | |
| ++(p->count); | |
| if (i == j) return; | |
| if (!p->link.count(x[i])) | |
| p->link[x[i]] = make_tuple(i, j, make_node()); | |
| auto &e = p->link[ x[i] ]; | |
| size_t I = get<0>(e), J = get<1>(e); | |
| for (; x[i] == x[I] && i < j && I < J; ++i, ++I); | |
| if (I == J) { | |
| p = get<2>(e); | |
| } else { | |
| node *r = make_node(); | |
| r->count = get<2>(e)->count; | |
| r->link[x[I]] = make_tuple(I, J, get<2>(e)); | |
| get<1>(e) = I; | |
| get<2>(e) = r; | |
| p = r; | |
| } | |
| } | |
| } | |
| void insert(vector<size_t> y) { | |
| size_t i = x.size(); | |
| copy(y.begin(), y.end(), back_inserter(x)); | |
| x.push_back(-1); | |
| for (size_t k = i; k < x.size(); ++k) | |
| insert(k, x.size()); | |
| } | |
| tuple<size_t, size_t, size_t> run(size_t a, vector<size_t> y, size_t i) const { | |
| size_t j = y.size(); | |
| for (node *p = root; ;) { | |
| if (i == j) { | |
| size_t c = p->count, n = p->link.size(), ca = 0; | |
| if (p->link.count(-1)) { | |
| c -= get<2>(p->link[-1])->count; | |
| n -= 1; | |
| } | |
| if (p->link.count(a)) ca = get<2>(p->link[a])->count; | |
| return make_tuple(ca, c, n); | |
| } | |
| if (!p->link.count(y[i])) return make_tuple(0, 0, 0); | |
| const auto &e = p->link[ y[i] ]; | |
| size_t I = get<0>(e), J = get<1>(e); | |
| for (; y[i] == x[I] && i < j && I < J; ++i, ++I); | |
| if (I == J) p = get<2>(e); | |
| else if (i != j) return make_tuple(0, 0, 0); // x[i] != x[I] | |
| else { | |
| size_t c = get<2>(e)->count; | |
| return make_tuple(a != x[I] ? 0 : c, c, 1); | |
| } | |
| } | |
| } | |
| // conditional probability P(a|y) | |
| double probability(size_t a, vector<size_t> y) { | |
| const double d = 0.75; | |
| double P = 1.0 / (root->link.size() - 1); | |
| for (size_t i = y.size(); i <= y.size(); --i) { | |
| tuple<size_t, size_t, size_t> r = run(a, y, i); | |
| if (get<1>(r) == 0) break; | |
| P = (max(0.0, get<0>(r) - d) + d * get<2>(r) * P) / get<1>(r); | |
| } | |
| return P; | |
| } | |
| size_t size() const { return total_nodes; } | |
| }; | |
| dictionary<string> wdic; | |
| int main() { | |
| tick(); | |
| sequence_memoizer SM; | |
| FILE *fp = fopen("ap.txt", "r"); | |
| size_t line = 0; | |
| for (char buf[10000]; fgets(buf, sizeof(buf), fp); ) { | |
| if (buf[0] == '<') continue; | |
| int n = strlen(buf); | |
| for (int i = 0; i < n; ++i) | |
| if (!isalnum(buf[i])) buf[i] = ' '; | |
| stringstream ss(buf); | |
| vector<size_t> x; | |
| for (string s; ss >> s; ) | |
| x.push_back( wdic.id(s) ); | |
| if (!x.empty()) { | |
| SM.insert(x); | |
| ++line; | |
| if (line % 100 == 0) | |
| cout << line << " " << wdic.size() << " " << SM.size() << " " << SM.root->count << endl; | |
| } | |
| } | |
| for (char buf[10000]; fgets(buf, sizeof(buf), stdin); ) { | |
| stringstream ss(buf); | |
| vector<size_t> x; | |
| size_t k = 0; | |
| for (string s; ss >> s; ) { | |
| x.push_back( wdic.id(s) ); | |
| } | |
| if (x.size() > 0) { | |
| size_t a = x.back(); | |
| x.pop_back(); | |
| cout << SM.probability(a,x) << endl; | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment