Skip to content

Instantly share code, notes, and snippets.

@jcreedcmu
Last active July 17, 2024 23:06
Show Gist options
  • Save jcreedcmu/55a47fcd4f1a8f7f5eb0131002be3165 to your computer and use it in GitHub Desktop.
Save jcreedcmu/55a47fcd4f1a8f7f5eb0131002be3165 to your computer and use it in GitHub Desktop.
cvm.js
// A quick and dirty lil implementation of the CVM algorithm
// (https://arxiv.org/pdf/2301.10191)
//
// I put Moby Dick (https://www.gutenberg.org/cache/epub/2701/pg2701.txt)
// into /tmp/mb.txt
const fs = require('fs');
const mb = fs.readFileSync('/tmp/mb.txt', 'utf8')
.replace(/[^a-zA-Z-’]/g, ' ')
.split(/\s+/);
console.log('words', mb.length);
console.time('count');
const unique = {};
mb.forEach(word => {
unique[word] = 1;
});
console.log('unique words', Object.keys(unique).length);
console.timeEnd('count');
console.time('estimate');
const N = 1000;
const basket = {};
let p = 1;
let count = 0;
mb.forEach(word => {
if (basket[word]) {
delete basket[word];
count--;
}
if (Math.random() < p) {
basket[word] = 1;
count++;
}
while (count >= N) {
p = p/2;
const ks = Object.keys(basket);
ks.forEach(k => {
if (Math.random() < 0.5) {
delete basket[k];
count--;
}
});
}
});
console.log('final p: 1/', 1/p);
console.log('word sample:', Object.keys(basket).join(', '));
console.log('estimate:', Object.keys(basket).length / p)
console.timeEnd('estimate');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment