axiak · April 3, 2014 16:18
diff --git a/key_distribution.py b/key_distribution.py
 #!/usr/bin/env python
 import sys
 import mmh3
 import math
 import random
 import struct

 try:
    import numpy as np
    from matplotlib import pyplot
 except ImportError:
    np = pyplot = None


 CUSTOMER_CHURN_RATE = 1.1
 AVERAGE_CONTACT_COUNT = 1000
 SHARD_SIZE = 16


 def main():
    with_hash, without_hash = [], []

    for customer, contact_id in contact_id_customer_pair(100):
        with_hash.append(key_with_hash(customer, contact_id))
        without_hash.append(simple_key(customer, contact_id))

    with_hash.sort()
    without_hash.sort()

    if len(sys.argv) > 1:
        splits = int(sys.argv[1])
    else:
        splits = 10

    show_distribution('with hash', with_hash, splits)
    show_distribution('without hash', without_hash, splits)

    inputs = {
        #'with hash': with_hash,
        'simple key': without_hash
    }
    graph_distribution(inputs, max_split=100)


 def show_distribution(name, input, splits=10):
    print name
    for i in range(0, len(input), len(input) / splits):
        print repr(input[i])
    print repr(input[-1])
    print ''


 def graph_distribution(inputs, splits=(10, 100, 250), max_split=2 ** 63 - 1):
    if np is None or pyplot is None:
        print "Not graphing because matplotlib/numpy isn't installed. :-("
        return

    keys = sorted(inputs.keys())
    pyplot.xlim(0, max_split)

    pyplot.figure(1)

    for i, key in enumerate(keys):
        pyplot.subplot(len(keys), 1, i + 1)
        key_result = []
        for value in inputs[key]:
            key_result.append(struct.unpack(">L", value[:4])[0])
        print sorted(key_result)
        key_result = np.array(sorted(key_result))

        for j, split in enumerate(splits):
            bins = np.linspace(0, max_split, split)
            pyplot.hist(key_result, bins, alpha=0.5, normed=True, histtype='step', label="{0} bins".format(split))
            if not j:
                pyplot.hold(True)
        pyplot.grid(True)
        pyplot.legend()
        pyplot.title(key)
        pyplot.hold(False)

    pyplot.show()


 def key_with_hash(customer, contact_id):
    customercontact_id = struct.pack(">lq", customer, contact_id)
    shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
    prefix = mmh3.hash(chr(shard) + struct.pack(">l", customer))
    return struct.pack('>l', prefix) + customercontact_id


 def key_no_hash(customer, contact_id):
    customercontact_id = struct.pack(">lq", customer, contact_id)
    shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
    return chr(shard) + customercontact_id


 def simple_key(customer, contact_id):
    return struct.pack('>lq', customer, contact_id)


 def contact_id_customer_pair(num_customers):
    sample = list(range(1, int(num_customers / CUSTOMER_CHURN_RATE)))
    random.shuffle(sample)
    customers = sorted(sample[:num_customers])
    for customer in customers:
        for contact_id in contact_ids_for_customer():
            yield customer, contact_id


 def contact_ids_for_customer():
    return list(range(1, int(power_law(1, AVERAGE_CONTACT_COUNT, 10))))


 def power_law(lower, upper, n):
    y = random.random()
    return upper - math.pow((upper ** (n + 1) - lower ** (n + 1)) * y + lower ** (n + 1), 1 / float(n + 1))


 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	import sys
	import mmh3
	import math
	import random
	import struct

	try:
	import numpy as np
	from matplotlib import pyplot
	except ImportError:
	np = pyplot = None


	CUSTOMER_CHURN_RATE = 1.1
	AVERAGE_CONTACT_COUNT = 1000
	SHARD_SIZE = 16


	def main():
	with_hash, without_hash = [], []

	for customer, contact_id in contact_id_customer_pair(100):
	with_hash.append(key_with_hash(customer, contact_id))
	without_hash.append(simple_key(customer, contact_id))

	with_hash.sort()
	without_hash.sort()

	if len(sys.argv) > 1:
	splits = int(sys.argv[1])
	else:
	splits = 10

	show_distribution('with hash', with_hash, splits)
	show_distribution('without hash', without_hash, splits)

	inputs = {
	#'with hash': with_hash,
	'simple key': without_hash
	}
	graph_distribution(inputs, max_split=100)


	def show_distribution(name, input, splits=10):
	print name
	for i in range(0, len(input), len(input) / splits):
	print repr(input[i])
	print repr(input[-1])
	print ''


	def graph_distribution(inputs, splits=(10, 100, 250), max_split=2 ** 63 - 1):
	if np is None or pyplot is None:
	print "Not graphing because matplotlib/numpy isn't installed. :-("
	return

	keys = sorted(inputs.keys())
	pyplot.xlim(0, max_split)

	pyplot.figure(1)

	for i, key in enumerate(keys):
	pyplot.subplot(len(keys), 1, i + 1)
	key_result = []
	for value in inputs[key]:
	key_result.append(struct.unpack(">L", value[:4])[0])
	print sorted(key_result)
	key_result = np.array(sorted(key_result))

	for j, split in enumerate(splits):
	bins = np.linspace(0, max_split, split)
	pyplot.hist(key_result, bins, alpha=0.5, normed=True, histtype='step', label="{0} bins".format(split))
	if not j:
	pyplot.hold(True)
	pyplot.grid(True)
	pyplot.legend()
	pyplot.title(key)
	pyplot.hold(False)

	pyplot.show()


	def key_with_hash(customer, contact_id):
	customercontact_id = struct.pack(">lq", customer, contact_id)
	shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
	prefix = mmh3.hash(chr(shard) + struct.pack(">l", customer))
	return struct.pack('>l', prefix) + customercontact_id


	def key_no_hash(customer, contact_id):
	customercontact_id = struct.pack(">lq", customer, contact_id)
	shard = abs(mmh3.hash(customercontact_id) % SHARD_SIZE)
	return chr(shard) + customercontact_id


	def simple_key(customer, contact_id):
	return struct.pack('>lq', customer, contact_id)


	def contact_id_customer_pair(num_customers):
	sample = list(range(1, int(num_customers / CUSTOMER_CHURN_RATE)))
	random.shuffle(sample)
	customers = sorted(sample[:num_customers])
	for customer in customers:
	for contact_id in contact_ids_for_customer():
	yield customer, contact_id


	def contact_ids_for_customer():
	return list(range(1, int(power_law(1, AVERAGE_CONTACT_COUNT, 10))))


	def power_law(lower, upper, n):
	y = random.random()
	return upper - math.pow((upper (n + 1) - lower (n + 1)) * y + lower ** (n + 1), 1 / float(n + 1))


	if __name__ == '__main__':
	main()