Created
December 4, 2012 16:55
-
-
Save VorticonCmdr/4206141 to your computer and use it in GitHub Desktop.
hyperloglog for elasticsearch script facet plugin (alpha code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
heap code: http://eloquentjavascript.net/appendix2.html | |
hyperloglog: https://github.com/sedictor/loglog | |
top k elements: http://stevehanov.ca/blog/index.php?id=122 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var HASH_LENGTH = 32, | |
HASH_K = 5; | |
function hash(str) { | |
var hash = 0; | |
for (var i = 0, l = str.length; i < l; i++) { | |
hash += str.charCodeAt(i); | |
hash += hash << 10; | |
hash ^= hash >> 6; | |
} | |
hash += hash << 3; | |
hash ^= hash >> 6; | |
hash += hash << 16; | |
return hash; | |
} | |
function scan1(bites) { | |
if (bites == 0) { | |
return HASH_LENGTH - HASH_K; | |
} | |
var offset = parseInt(Math.log(bites) / Math.log(2)); | |
offset = HASH_LENGTH - HASH_K - offset; | |
return offset; | |
} | |
function getBites(bites, start, end) { | |
var r = bites >> (HASH_LENGTH - end); | |
r = r & (Math.pow(2, end - start + 1) - 1); | |
return r; | |
} | |
var k = doc[key].value; | |
if (facet[k] == null || typeof facet[k] == 'undefined') { | |
facet[k] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; //HASH_LENGTH | |
} | |
var h = hash(doc[value].value), | |
j = getBites(h, 1, HASH_K) + 1, | |
w = getBites(h, HASH_K + 1, HASH_LENGTH); | |
w = scan1(w); | |
if (typeof facet[k][j] == 'undefined' || facet[k][j] < w) { | |
facet[k][j] = w; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var HASH_LENGTH = 32, // bites | |
HASH_K = 5; // HASH_LENGTH = 2 ^ HASH_K | |
var alpha = 0.697122946; // 1 / (32 * integral(0,inf)( (log2(1+1/(1+x)))^32 dx)) | |
function estimate(M) { | |
var Z = 0; | |
for (var i = 1; i <= HASH_LENGTH; i++) { | |
if (typeof M[i] != 'undefined' && M[i] != 0) { | |
Z += 1 / Math.pow(2, M[i]); | |
} else { | |
Z += 1; | |
} | |
} | |
Z = alpha * HASH_LENGTH * HASH_LENGTH / Z; | |
return parseInt(Z); | |
} | |
function BinaryHeap(scoreFunction){ | |
this.content = []; | |
this.scoreFunction = scoreFunction; | |
} | |
BinaryHeap.prototype = { | |
push: function(element) { | |
// Add the new element to the end of the array. | |
this.content.push(element); | |
// Allow it to bubble up. | |
this.bubbleUp(this.content.length - 1); | |
}, | |
pop: function() { | |
// Store the first element so we can return it later. | |
var result = this.content[0]; | |
// Get the element at the end of the array. | |
var end = this.content.pop(); | |
// If there are any elements left, put the end element at the | |
// start, and let it sink down. | |
if (this.content.length > 0) { | |
this.content[0] = end; | |
this.sinkDown(0); | |
} | |
return result; | |
}, | |
remove: function(node) { | |
var len = this.content.length; | |
// To remove a value, we must search through the array to find | |
// it. | |
for (var i = 0; i < len; i++) { | |
if (this.content[i] == node) { | |
// When it is found, the process seen in 'pop' is repeated | |
// to fill up the hole. | |
var end = this.content.pop(); | |
if (i != len - 1) { | |
this.content[i] = end; | |
if (this.scoreFunction(end) < this.scoreFunction(node)) | |
this.bubbleUp(i); | |
else | |
this.sinkDown(i); | |
} | |
return; | |
} | |
} | |
throw new Error("Node not found."); | |
}, | |
size: function() { | |
return this.content.length; | |
}, | |
bubbleUp: function(n) { | |
// Fetch the element that has to be moved. | |
var element = this.content[n]; | |
// When at 0, an element can not go up any further. | |
while (n > 0) { | |
// Compute the parent element's index, and fetch it. | |
var parentN = Math.floor((n + 1) / 2) - 1, | |
parent = this.content[parentN]; | |
// Swap the elements if the parent is greater. | |
if (this.scoreFunction(element) < this.scoreFunction(parent)) { | |
this.content[parentN] = element; | |
this.content[n] = parent; | |
// Update 'n' to continue at the new position. | |
n = parentN; | |
} | |
// Found a parent that is less, no need to move it further. | |
else { | |
break; | |
} | |
} | |
}, | |
sinkDown: function(n) { | |
// Look up the target element and its score. | |
var length = this.content.length, | |
element = this.content[n], | |
elemScore = this.scoreFunction(element); | |
while(true) { | |
// Compute the indices of the child elements. | |
var child2N = (n + 1) * 2, child1N = child2N - 1; | |
// This is used to store the new position of the element, | |
// if any. | |
var swap = null; | |
// If the first child exists (is inside the array)... | |
if (child1N < length) { | |
// Look it up and compute its score. | |
var child1 = this.content[child1N], | |
child1Score = this.scoreFunction(child1); | |
// If the score is less than our element's, we need to swap. | |
if (child1Score < elemScore) | |
swap = child1N; | |
} | |
// Do the same checks for the other child. | |
if (child2N < length) { | |
var child2 = this.content[child2N], | |
child2Score = this.scoreFunction(child2); | |
if (child2Score < (swap == null ? elemScore : child1Score)) | |
swap = child2N; | |
} | |
// If the element needs to be moved, swap it, and continue. | |
if (swap != null) { | |
this.content[n] = this.content[swap]; | |
this.content[swap] = element; | |
n = swap; | |
} | |
// Otherwise, we are done. | |
else { | |
break; | |
} | |
} | |
} | |
}; | |
var l = facets.length; | |
var d = facets[0]; | |
for (var i=1; i<l; i++) { | |
var f = facets[i]; | |
for (k in f) { | |
if ((typeof d[k] != 'undefined') && (d[k] != null)) { | |
var M = d[k]; | |
var N = f[k]; | |
for (var j=0; j<N.length; j++) { | |
if (M[j] < N[j]) { | |
M[j] = N[j]; | |
} | |
} | |
d[k] = M; | |
} | |
else { | |
d[k] = f[k]; | |
} | |
} | |
} | |
var size = 10; | |
var heap = new BinaryHeap(function(x){return x;}); | |
var v = 0; | |
var h = {}; | |
for (k in d) { | |
v = estimate(d[k]); | |
h = {}; | |
h['url'] = k; | |
h['count'] = v; | |
if ((heap.size() < size) || (v > heap.content[0]['count'])) { | |
if (heap.size() == size) { | |
heap.pop(); | |
} | |
heap.push(h); | |
} | |
} | |
facet = heap.content; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
prerequistes: | |
https://github.com/imotov/elasticsearch-facet-script | |
https://github.com/elasticsearch/elasticsearch-lang-javascript | |
"facets": { | |
"facet1": { | |
"script": { | |
"map_script": "hll_map", | |
"reduce_script" : "my_reduce", | |
"params" : { | |
"facet" : {}, | |
"key" : "term", | |
"value": "unique" | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey there, it has been a while since you posted this gist. Any feedback/recommendation on estimating the number of uniq values of a field? Do you still use this JS code?