Created
February 25, 2015 01:05
-
-
Save simonrad/208f2c2fa6e7aeefaa6c to your computer and use it in GitHub Desktop.
shard_distribution_probability.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -------------------------------------------------- | |
| # Output: | |
| # We randomly distribute T tasks across N shards. | |
| # We compute the 95th percentile of (size of largest shard / size of average shard). | |
| # 2 shards, 2 tasks = 2.00 | |
| # 2 shards, 8 tasks = 1.75 | |
| # 2 shards, 40 tasks = 1.30 | |
| # 2 shards, 200 tasks = 1.14 | |
| # 5 shards, 5 tasks = 3.00 | |
| # 5 shards, 20 tasks = 2.00 | |
| # 5 shards, 100 tasks = 1.45 | |
| # 5 shards, 500 tasks = 1.21 | |
| # 20 shards, 20 tasks = 5.00 | |
| # 20 shards, 80 tasks = 2.50 | |
| # 20 shards, 400 tasks = 1.70 | |
| # 20 shards, 2000 tasks = 1.28 | |
| # 100 shards, 100 tasks = 6.00 | |
| # 100 shards, 400 tasks = 3.00 | |
| # 100 shards, 2000 tasks = 1.80 | |
| # 100 shards, 10000 tasks = 1.34 | |
| # 500 shards, 500 tasks = 6.00 | |
| # 500 shards, 2000 tasks = 3.25 | |
| # 500 shards, 10000 tasks = 1.90 | |
| # 500 shards, 50000 tasks = 1.39 | |
| # -------------------------------------------------- | |
| import random | |
| # Randomly distributes the items across the shards. | |
| # Returns the size of the largest shard / the size of the average shard. | |
| def run_test_once(num_shards, num_items): | |
| # Not really a map. | |
| item_to_shard_map = [random.randrange(num_shards) for i in range(num_items)] | |
| shard_counts = [0] * num_shards | |
| for shard in item_to_shard_map: | |
| shard_counts[shard] += 1 | |
| size_of_average_shard = (float(num_items) / num_shards) | |
| return max(shard_counts) / size_of_average_shard | |
| def run_test_many_times(num_shards, num_items, num_times_to_run = 1000): | |
| results_list = [run_test_once(num_shards, num_items) for i in range(num_times_to_run)] | |
| # Return the average result of run_test_once(). | |
| # return sum(results_list) / len(results_list) | |
| # Return the 95th percentile result of run_test_once(). | |
| return sorted(results_list)[len(results_list) * 95/100] | |
| print 'We randomly distribute T tasks across N shards.' | |
| print 'We compute the 95th percentile of (size of largest shard / size of average shard).' | |
| for num_shards in [2, 5, 20, 100, 500]: | |
| for num_items_multiple in [1, 4, 20, 100]: | |
| num_items = num_items_multiple * num_shards | |
| result = run_test_many_times(num_shards, num_items) | |
| print "%3d shards, %5d tasks = %.2f" % (num_shards, num_items, result) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment