Skip to content

Instantly share code, notes, and snippets.

@spaghetti-source
Last active December 27, 2015 02:09
Show Gist options
  • Select an option

  • Save spaghetti-source/7250727 to your computer and use it in GitHub Desktop.

Select an option

Save spaghetti-source/7250727 to your computer and use it in GitHub Desktop.
//
// (Simplified) Sequence Memoizer
// for (unbounded depth) Kneser-Ney smoothing
//
// Reference:
//
// - J. Gasthaus, F. Wood, and Y. W. Teh (2010):
// Lossless compression based on the Sequence Memoizer.
// In Proceedings of Data Compression Conference,
// pp. 337-345.
//
// - F. Wood, J. Gasthaus, G. Archambeau, L. James, and Y. W. Teh (2011):
// The sequence memoizer.
// Communications of the ACM,
// vol. 54, no. 2, pp. 91-98.
//
//
// Sample Input:
// ap.txt in http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
//
//
#include <iostream>
#include <vector>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <sstream>
#include <cmath>
#include <cstring>
#include <fstream>
#include <functional>
#include <algorithm>
#include <unordered_map>
#include <iterator>
#include <ctime>
using namespace std;
template <class T>
struct dictionary {
unordered_map<T, size_t> dict;
vector<T> idict;
bool has(T x) const {
return dict.count(x);
}
size_t id(T x) {
if (!has(x)) {
dict[x] = idict.size();
idict.push_back(x);
}
return dict[x];
}
T value(size_t id) {
return idict[id];
}
size_t size() const {
return idict.size();
}
};
double tick() {
static clock_t oldtick;
clock_t newtick = clock();
double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC;
oldtick = newtick;
return diff;
}
// ---
struct sequence_memoizer {
vector<size_t> x;
struct node {
size_t count;
unordered_map<size_t, tuple<size_t, size_t, node*>> link;
} *root;
size_t total_nodes;
node *make_node() {
++total_nodes;
node *t = new node();
t->count = 0;
return t;
}
sequence_memoizer() {
total_nodes = 0;
root = make_node();
}
void insert(size_t i, size_t j) {
for (node *p = root; ;) {
++(p->count);
if (i == j) return;
if (!p->link.count(x[i]))
p->link[x[i]] = make_tuple(i, j, make_node());
auto &e = p->link[ x[i] ];
size_t I = get<0>(e), J = get<1>(e);
for (; x[i] == x[I] && i < j && I < J; ++i, ++I);
if (I == J) {
p = get<2>(e);
} else {
node *r = make_node();
r->count = get<2>(e)->count;
r->link[x[I]] = make_tuple(I, J, get<2>(e));
get<1>(e) = I;
get<2>(e) = r;
p = r;
}
}
}
void insert(vector<size_t> y) {
size_t i = x.size();
copy(y.begin(), y.end(), back_inserter(x));
x.push_back(-1);
for (size_t k = i; k < x.size(); ++k)
insert(k, x.size());
}
tuple<size_t, size_t, size_t> run(size_t a, vector<size_t> y, size_t i) const {
size_t j = y.size();
for (node *p = root; ;) {
if (i == j) {
size_t c = p->count, n = p->link.size(), ca = 0;
if (p->link.count(-1)) {
c -= get<2>(p->link[-1])->count;
n -= 1;
}
if (p->link.count(a)) ca = get<2>(p->link[a])->count;
return make_tuple(ca, c, n);
}
if (!p->link.count(y[i])) return make_tuple(0, 0, 0);
const auto &e = p->link[ y[i] ];
size_t I = get<0>(e), J = get<1>(e);
for (; y[i] == x[I] && i < j && I < J; ++i, ++I);
if (I == J) p = get<2>(e);
else if (i != j) return make_tuple(0, 0, 0); // x[i] != x[I]
else {
size_t c = get<2>(e)->count;
return make_tuple(a != x[I] ? 0 : c, c, 1);
}
}
}
// conditional probability P(a|y)
double probability(size_t a, vector<size_t> y) {
const double d = 0.75;
double P = 1.0 / (root->link.size() - 1);
for (size_t i = y.size(); i <= y.size(); --i) {
tuple<size_t, size_t, size_t> r = run(a, y, i);
if (get<1>(r) == 0) break;
P = (max(0.0, get<0>(r) - d) + d * get<2>(r) * P) / get<1>(r);
}
return P;
}
size_t size() const { return total_nodes; }
};
dictionary<string> wdic;
int main() {
tick();
sequence_memoizer SM;
FILE *fp = fopen("ap.txt", "r");
size_t line = 0;
for (char buf[10000]; fgets(buf, sizeof(buf), fp); ) {
if (buf[0] == '<') continue;
int n = strlen(buf);
for (int i = 0; i < n; ++i)
if (!isalnum(buf[i])) buf[i] = ' ';
stringstream ss(buf);
vector<size_t> x;
for (string s; ss >> s; )
x.push_back( wdic.id(s) );
if (!x.empty()) {
SM.insert(x);
++line;
if (line % 100 == 0)
cout << line << " " << wdic.size() << " " << SM.size() << " " << SM.root->count << endl;
}
}
for (char buf[10000]; fgets(buf, sizeof(buf), stdin); ) {
stringstream ss(buf);
vector<size_t> x;
size_t k = 0;
for (string s; ss >> s; ) {
x.push_back( wdic.id(s) );
}
if (x.size() > 0) {
size_t a = x.back();
x.pop_back();
cout << SM.probability(a,x) << endl;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment