Skip to content

Instantly share code, notes, and snippets.

@simonrad
Created February 25, 2015 01:05
Show Gist options
  • Select an option

  • Save simonrad/208f2c2fa6e7aeefaa6c to your computer and use it in GitHub Desktop.

Select an option

Save simonrad/208f2c2fa6e7aeefaa6c to your computer and use it in GitHub Desktop.
shard_distribution_probability.py
# --------------------------------------------------
# Output:
# We randomly distribute T tasks across N shards.
# We compute the 95th percentile of (size of largest shard / size of average shard).
# 2 shards, 2 tasks = 2.00
# 2 shards, 8 tasks = 1.75
# 2 shards, 40 tasks = 1.30
# 2 shards, 200 tasks = 1.14
# 5 shards, 5 tasks = 3.00
# 5 shards, 20 tasks = 2.00
# 5 shards, 100 tasks = 1.45
# 5 shards, 500 tasks = 1.21
# 20 shards, 20 tasks = 5.00
# 20 shards, 80 tasks = 2.50
# 20 shards, 400 tasks = 1.70
# 20 shards, 2000 tasks = 1.28
# 100 shards, 100 tasks = 6.00
# 100 shards, 400 tasks = 3.00
# 100 shards, 2000 tasks = 1.80
# 100 shards, 10000 tasks = 1.34
# 500 shards, 500 tasks = 6.00
# 500 shards, 2000 tasks = 3.25
# 500 shards, 10000 tasks = 1.90
# 500 shards, 50000 tasks = 1.39
# --------------------------------------------------
import random
# Randomly distributes the items across the shards.
# Returns the size of the largest shard / the size of the average shard.
def run_test_once(num_shards, num_items):
# Not really a map.
item_to_shard_map = [random.randrange(num_shards) for i in range(num_items)]
shard_counts = [0] * num_shards
for shard in item_to_shard_map:
shard_counts[shard] += 1
size_of_average_shard = (float(num_items) / num_shards)
return max(shard_counts) / size_of_average_shard
def run_test_many_times(num_shards, num_items, num_times_to_run = 1000):
results_list = [run_test_once(num_shards, num_items) for i in range(num_times_to_run)]
# Return the average result of run_test_once().
# return sum(results_list) / len(results_list)
# Return the 95th percentile result of run_test_once().
return sorted(results_list)[len(results_list) * 95/100]
print 'We randomly distribute T tasks across N shards.'
print 'We compute the 95th percentile of (size of largest shard / size of average shard).'
for num_shards in [2, 5, 20, 100, 500]:
for num_items_multiple in [1, 4, 20, 100]:
num_items = num_items_multiple * num_shards
result = run_test_many_times(num_shards, num_items)
print "%3d shards, %5d tasks = %.2f" % (num_shards, num_items, result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment