Last active
August 29, 2015 14:05
-
-
Save mtourne/f7e1e05fc64fc9f5cbb3 to your computer and use it in GitHub Desktop.
top k in Javascript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function topk_words(content, nbItems) { | |
var space_saver = new_space_saver(nbItems * 2); | |
var words = content.toLowerCase().replace(/\W/g, ' ').split(/\s+/); | |
for (var i = 0; i < words.length; i++) { | |
insert_space_saver(space_saver, words[i]); | |
} | |
return get_top_space_saver(space_saver, nbItems); | |
} | |
// create a new k element space saver | |
function new_space_saver(nbItems) { | |
var scores = { | |
// we'll be storing token, score (counter) and card (cardinality) | |
// for each element | |
// - scores.token[<token>] can access the struct by token | |
// - scores.card[<card>] can access the struct by card | |
token : {}, | |
card : [], | |
size : nbItems, | |
overrestimates: {} | |
}; | |
// initiate all counters to 0 | |
for (var i = 0; i < nbItems; i++) { | |
scores.card[i] = { | |
score: 0, | |
token: null, | |
card: i | |
}; | |
} | |
return scores; | |
} | |
// swap two scores based on cardinality in the set | |
function swap_scores_space_saver(scores, card1, card2) { | |
var t1 = scores.card[card1]; | |
var t2 = scores.card[card2]; | |
// swap the cardinalities | |
t1.card = card2; | |
t2.card = card1; | |
scores.card[card1] = t2; | |
scores.card[card2] = t1; | |
} | |
// add +1 to a token in the space saver | |
// Note: adding +n is a separate case | |
// (would require insertion in sorted list) | |
// Here, at most we'll do 1 swap | |
function inc_space_saver(scores, t) { | |
var new_score = t.score + 1; | |
t.score = new_score; | |
var swap_from = t.card; | |
var swap_to = t.card + 1; | |
var next_t = scores.card[swap_to]; | |
var next_next_t; | |
if (next_t && new_score > next_t.score) { | |
next_next_t = scores.card[swap_to + 1]; | |
// skip equal elements to minimize number of swaps | |
while (next_next_t && next_t.score == next_next_t.score) { | |
swap_to = swap_to + 1; | |
next_next_t = scores.card[swap_to + 1]; | |
} | |
swap_scores_space_saver(scores, swap_from, swap_to); | |
return swap_to; | |
} | |
// no swap | |
return swap_from; | |
} | |
function insert_space_saver(scores, token) { | |
var t = scores.token[token]; | |
if (t) { | |
// token is already present, inc by 1 | |
// console.log(t); | |
inc_space_saver(scores, t); | |
} else { | |
// get t1, the cardinality-1 score (lowest score) | |
var t1 = scores.card[0]; | |
if (t1.token) { | |
// remove reference to old token | |
scores.token[t1.token] = null; | |
scores.overrestimates[t1.token] = null; | |
} | |
// set new token | |
t1.token = token; | |
scores.token[token] = t1; | |
// overrestimate is old score | |
scores.overrestimates[token] = t1.score; | |
// inc t1's score by 1 | |
inc_space_saver(scores, t1); | |
} | |
} | |
// TODO: only return the tokens above a certain | |
// confidence factor (low overrestimates) | |
function get_top_space_saver(scores, max) { | |
var card = scores.card | |
var output = [] | |
for (var i = card.length - 1; i >= 0 && max > 0; i--, max--) { | |
output.push(card[i].token); | |
} | |
return output; | |
} | |
//// TESTING //// | |
var PRIMES = [ | |
7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, | |
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, | |
137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, | |
199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, | |
277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, | |
359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, | |
439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, | |
521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, | |
607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677, | |
683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, | |
773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, | |
863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, | |
967, 971, 977, 983, 991, 997 ] | |
function get_prime(num) { | |
var i = PRIMES.length; | |
var prime; | |
while (i >= 0) { | |
prime = PRIMES[i] | |
if (num % prime == 0) { | |
return prime | |
} | |
i = i - 1 | |
} | |
return num | |
} | |
// testing | |
var space_saver = new_space_saver(PRIMES.length * 2); | |
var ITERATION = 1000000; | |
for (var i = 0; i < ITERATION; i++) { | |
var num = Math.floor((Math.random() * 100000000) + 1); | |
num = get_prime(num); | |
// console.log(num.toString()); | |
insert_space_saver(space_saver, num.toString()); | |
} | |
var topk = get_top_space_saver(space_saver, PRIMES.length); | |
var i; | |
for (i = 0; i < PRIMES.length; i++) { | |
if (topk[i] != PRIMES[i].toString()) { | |
break; | |
} | |
} | |
console.log("Primes are good until: " + i + "; Max :" + (PRIMES.length - 1)); | |
var topk_words = topk_words("blah titi blah toto blah titi titi toto foo", 3); | |
console.log(topk_words); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment