Created
February 3, 2015 04:01
-
-
Save softwaredoug/aea69681f538d9780c82 to your computer and use it in GitHub Desktop.
Educational HyperLogLog demo in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdio.h" | |
#include "stdlib.h" | |
unsigned int max(unsigned int a, unsigned int b) { | |
return a > b ? a : b; | |
} | |
unsigned int bucket(unsigned int val) { | |
return val & 0xF0000000 >> 28; | |
} | |
unsigned int trailingZeros(unsigned int val) { | |
unsigned int mask = 0x00000001; | |
unsigned int cnt = 0; | |
while (val) { | |
val = val & mask; | |
mask <<= 1; | |
cnt++; | |
} | |
return cnt; | |
} | |
void initBuckets(unsigned int* buckets) { | |
int i; | |
for (i = 0; i < 16; i++) { | |
buckets[i] = 0; | |
} | |
} | |
unsigned int cardinality(unsigned int* buckets) { | |
int i; | |
unsigned int cnt = 0; | |
for (i = 0; i < 16; i++) { | |
if (buckets[i]) { | |
cnt += (1 << (buckets[i] - 1)); // 2^buckets[i] | |
} | |
} | |
return cnt; | |
} | |
void diagnosticReport(unsigned int* buckets) { | |
int i; | |
for (i = 0; i < 16; i++) { | |
printf("bucket %i: %08x\n", i, buckets[i]); | |
} | |
printf("Cardinality %i\n", cardinality(buckets)); | |
} | |
int main() { | |
/*zero 16 buckets*/ | |
unsigned int buckets[16]; | |
initBuckets(buckets); | |
unsigned int inputNumber = 0; | |
unsigned int tryNo = 0; | |
while (1) { | |
printf("%ith Value for HLL\n", tryNo); | |
/*user enters a number, | |
* | |
* in real HLL we would hash the number, we're sorta expecting the user | |
* to bash on keys*/ | |
scanf("%u", &inputNumber); | |
/* | |
* hash this number to a bucket based on the <numbuckets> significant | |
* digits*/ | |
unsigned int buck = bucket(inputNumber); | |
printf("Hashed To Bucket %i\n", buck); | |
/* Write the largest number of trailing 0s we've seen thus far | |
* | |
* This is the magic of the hyperloglog, it gets very very hard | |
* to get more trailing zeros after some point | |
* | |
* Its much like flipping a coin | |
* */ | |
buckets[buck] = max(buckets[buck], trailingZeros(inputNumber)); | |
/* Print diagnistic, including cardinality. which is just a sum of | |
* 2^bucketVal over all buckets if bucketVal != 0 | |
* */ | |
diagnosticReport(buckets); | |
tryNo++; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The trailingZeros function doesn't seem to do what the function's name suggests it does. Here's an alternate version that might work for you.