Created
October 11, 2012 16:14
-
-
Save devdazed/3873524 to your computer and use it in GitHub Desktop.
Simple Linear Probabilistic Counters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Simple Linear Probabilistic Counters | |
Credit for idea goes to: | |
http://highscalability.com/blog/2012/4/5/big-data-counting-how-to-count-a-billion-distinct-objects-us.html | |
http://highlyscalable.wordpress.com/2012/05/01/probabilistic-structures-web-analytics-data-mining/ | |
Installation: | |
pip install smhasher | |
pip install bitarray | |
""" | |
import smhasher | |
import math | |
import itertools | |
from bitarray import bitarray | |
class LPCounter(object): | |
""" | |
Real Time Linear Probabilistic Counter of Unique Items | |
""" | |
def __init__(self, max_space): | |
""" | |
Initializes the counter, max_space is the maximum amount of memory | |
(in KB) you want to use to maintain your counter, more memory=more | |
accurate | |
""" | |
self.bit_map = bitarray(8 * 1024 * max_space) | |
self.bit_map.setall(False) | |
def current_count(self): | |
""" | |
Gets the current value of the bitmap, to do that we follow the formula: | |
-size * ln(unset_bits/size) | |
""" | |
ratio = float(self.bit_map.count(False)) / float(self.bit_map.length()) | |
if ratio == 0.0: | |
return self.bit_map.length() | |
else: | |
return -self.bit_map.length() * math.log(ratio) | |
def increment(self, item): | |
""" | |
Counts an item | |
""" | |
mm_hash = smhasher.murmur3_x64_128(str(item)) | |
offset = mm_hash % self.bit_map.length() | |
self.bit_map[offset] = True | |
if __name__ == '__main__': | |
def run_counts(tries, size): | |
"""Run some counts""" | |
lpc = LPCounter(size) | |
print "%d Kilobyte Bitmap" % size | |
print " Incrementing %d items" % tries | |
for i in itertools.count(0): | |
if i > tries: | |
break | |
lpc.increment(i) | |
count = lpc.current_count() | |
print " Count: %d" % count | |
print " Actual: %d" % tries | |
print " Error: %f%%" % abs((1 - (float(count) / float(tries))) * 100) | |
run_counts(1000000000, 16384) |
Author
devdazed
commented
Oct 11, 2012
16384 Kilobyte Bitmap
Incrementing 1000000000 items
Count: 1000288512
Actual: 1000000000
Error: 0.028851%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment