Created
July 30, 2014 16:52
-
-
Save drdee/32b880a24bd2dd2d8d46 to your computer and use it in GitHub Desktop.
Cardinality estimator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import mmh3 | |
from functools import partial | |
from itertools import izip | |
def estimateCardinality(self, significant_bits) | |
''' | |
Taken and slightly adapted from http://blog.notdot.net/2012/09/Dam-Cool-Algorithms-Cardinality-Estimation | |
Estimates the number of unique elements in the input set values. | |
significant_bits: The number of bits of hash to use as a bucket number; there will be 2**k buckets. | |
key: by default this function will estimate the cardinality of the rdd, i.e how many records | |
there are, but you can also pass an optional key to estimate the cardinality of a particular field. | |
''' | |
def _trailingZeroes(num): | |
"""Counts the number of trailing 0 bits in num.""" | |
if num == 0: | |
return 32 | |
p = 0 | |
while (num >> p) & 1 == 0: | |
p += 1 | |
return p | |
def runEstimator(row, num_buckets, significant_bits): | |
max_zeroes = [0] * num_buckets | |
for record in records: | |
h = mmh3.hash(unicode(record)) | |
bucket = h & (num_buckets - 1) | |
bucket_hash = h >> significant_bits | |
max_zeroes[bucket] = max(max_zeroes[bucket], _trailingZeroes(bucket_hash)) | |
yield max_zeroes | |
max_buckets = [] | |
num_buckets = 2 ** significant_bits | |
max_zeroes = self.mapPartitions(partial(runEstimator, num_buckets=num_buckets, significant_bits=significant_bits)).collect() | |
for x in izip(*max_zeroes): | |
max_buckets.append(max(x)) | |
return math.ceil(2 ** (float(sum(max_buckets)) / num_buckets) * num_buckets * 0.79402) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment