Created
April 3, 2014 16:18
-
-
Save axiak/9957599 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import mmh3 | |
import math | |
import random | |
import struct | |
try: | |
import numpy as np | |
from matplotlib import pyplot | |
except ImportError: | |
np = pyplot = None | |
CUSTOMER_CHURN_RATE = 1.1 | |
AVERAGE_CONTACT_COUNT = 1000 | |
SHARD_SIZE = 16 | |
def main(): | |
with_hash, without_hash = [], [] | |
for customer, contact_id in contact_id_customer_pair(100): | |
with_hash.append(key_with_hash(customer, contact_id)) | |
without_hash.append(simple_key(customer, contact_id)) | |
with_hash.sort() | |
without_hash.sort() | |
if len(sys.argv) > 1: | |
splits = int(sys.argv[1]) | |
else: | |
splits = 10 | |
show_distribution('with hash', with_hash, splits) | |
show_distribution('without hash', without_hash, splits) | |
inputs = { | |
#'with hash': with_hash, | |
'simple key': without_hash | |
} | |
graph_distribution(inputs, max_split=100) | |
def show_distribution(name, input, splits=10): | |
print name | |
for i in range(0, len(input), len(input) / splits): | |
print repr(input[i]) | |
print repr(input[-1]) | |
print '' | |
def graph_distribution(inputs, splits=(10, 100, 250), max_split=2 ** 63 - 1): | |
if np is None or pyplot is None: | |
print "Not graphing because matplotlib/numpy isn't installed. :-(" | |
return | |
keys = sorted(inputs.keys()) | |
pyplot.xlim(0, max_split) | |
pyplot.figure(1) | |
for i, key in enumerate(keys): | |
pyplot.subplot(len(keys), 1, i + 1) | |
key_result = [] | |
for value in inputs[key]: | |
key_result.append(struct.unpack(">L", value[:4])[0]) | |
print sorted(key_result) | |
key_result = np.array(sorted(key_result)) | |
for j, split in enumerate(splits): | |
bins = np.linspace(0, max_split, split) | |
pyplot.hist(key_result, bins, alpha=0.5, normed=True, histtype='step', label="{0} bins".format(split)) | |
if not j: | |
pyplot.hold(True) | |
pyplot.grid(True) | |
pyplot.legend() | |
pyplot.title(key) | |
pyplot.hold(False) | |
pyplot.show() | |
def key_with_hash(customer, contact_id): | |
customercontact_id = struct.pack(">lq", customer, contact_id) | |
shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE) | |
prefix = mmh3.hash(chr(shard) + struct.pack(">l", customer)) | |
return struct.pack('>l', prefix) + customercontact_id | |
def key_no_hash(customer, contact_id): | |
customercontact_id = struct.pack(">lq", customer, contact_id) | |
shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE) | |
return chr(shard) + customercontact_id | |
def simple_key(customer, contact_id): | |
return struct.pack('>lq', customer, contact_id) | |
def contact_id_customer_pair(num_customers): | |
sample = list(range(1, int(num_customers / CUSTOMER_CHURN_RATE))) | |
random.shuffle(sample) | |
customers = sorted(sample[:num_customers]) | |
for customer in customers: | |
for contact_id in contact_ids_for_customer(): | |
yield customer, contact_id | |
def contact_ids_for_customer(): | |
return list(range(1, int(power_law(1, AVERAGE_CONTACT_COUNT, 10)))) | |
def power_law(lower, upper, n): | |
y = random.random() | |
return upper - math.pow((upper ** (n + 1) - lower ** (n + 1)) * y + lower ** (n + 1), 1 / float(n + 1)) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment