Last active
August 29, 2015 14:05
-
-
Save sixeyed/5a1a9edac96132ade4ec to your computer and use it in GitHub Desktop.
MD5 hash distribution in C# - allocating to 1/200 for 0.5% splits
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//LinqPad | |
File.Delete("c:\\md5-list.txt"); | |
var hashes = new List<string>(); | |
var bytes = new byte[16]; | |
using (var rng = new RNGCryptoServiceProvider()) | |
{ | |
for (int i=0; i<500000; i++) | |
{ | |
rng.GetBytes(bytes); | |
var md5 = BitConverter.ToString(bytes).Replace("-", "").ToLower(); | |
hashes.Add(md5); | |
} | |
} | |
File.WriteAllLines("c:\\md5-list.txt", hashes.ToArray()); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// from: | |
// http://www.codeproject.com/Articles/21508/Distributed-Caching-Using-a-Hash-Algorithm | |
public class KeyDistributor<TKey> | |
{ | |
private int _numBuckets; | |
private int _pageSize; | |
private SHA1 _sha; | |
public int CalculateBucketIndex(TKey key) | |
{ | |
if (_numBuckets == 1) | |
{ | |
return 0; | |
} | |
// Create a SHA1 hash of the provided key | |
byte[] result; | |
byte[] data = BitConverter.GetBytes(key.GetHashCode()); | |
result = _sha.ComputeHash(data); | |
// Split the hash into 4 integer numbers | |
uint n1 = BitConverter.ToUInt32(result, 0); | |
uint n2 = BitConverter.ToUInt32(result, 4); | |
uint n3 = BitConverter.ToUInt32(result, 8); | |
uint n4 = BitConverter.ToUInt32(result, 12); | |
uint n5 = BitConverter.ToUInt32(result, 16); | |
// Apply XOR to derive a combined number | |
uint index = n1 ^ n2 ^ n3 ^ n4 ^ n5; | |
// Calculate the bucket index | |
int bucketIndex = Convert.ToInt32(Math.Floor((double)(index / _pageSize))); | |
return bucketIndex; | |
} | |
public KeyDistributor(int numBuckets) | |
{ | |
_numBuckets = numBuckets; | |
if (_numBuckets > 1) | |
{ | |
// Based on the number of buckets calculate the page size. | |
// It will be to link a given key to a specific bucket | |
_pageSize = Convert.ToInt32 | |
(Math.Ceiling((double)(uint.MaxValue / numBuckets))); | |
_sha = new SHA1Managed(); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Main() | |
{ | |
var hashes = new List<string>(File.ReadAllLines("c:\\md5-list.txt")); | |
int bucketCount = 200; | |
//init buckets: | |
var buckets = new int[bucketCount]; | |
for (int i=0; i<bucketCount; i++) | |
{ | |
buckets[i] = 0; | |
} | |
var distributor = new KeyDistributor<string>(bucketCount); | |
//distribute hashes: | |
foreach (var hash in hashes) | |
{ | |
var bucketIndex = distributor.CalculateBucketIndex(hash); | |
buckets[bucketIndex] = buckets[bucketIndex]+1; | |
} | |
//show distribution | |
for(int i=0; i<bucketCount; i++) | |
{ | |
string.Format("Bucket: {0}, count: {1}", i, buckets[i]).Dump(); | |
} | |
//pick some samples: | |
string.Format("Hash: {0}, bucket: {1}", hashes[501], distributor.CalculateBucketIndex(hashes[501])).Dump(); | |
string.Format("Hash: {0}, bucket: {1}", hashes[864], distributor.CalculateBucketIndex(hashes[864])).Dump(); | |
string.Format("Hash: {0}, bucket: {1}", hashes[2541], distributor.CalculateBucketIndex(hashes[2541])).Dump(); | |
string.Format("Hash: {0}, bucket: {1}", hashes[10876], distributor.CalculateBucketIndex(hashes[10876])).Dump(); | |
string.Format("Hash: {0}, bucket: {1}", hashes[450010], distributor.CalculateBucketIndex(hashes[450010])).Dump(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample use of Ivan Latunov's KeyDistributor to distribute 500K MD5 hashes among 200 buckets. Allows allocation of a single hash to a bucket which is 0.5% of the population.
Run GenerateMD5Hashes.linq in LinqPad to generate the hashes and write them to a file (~2 seconds); then repeatedly run TestDistribution.linq (~1 second per run).
You should find the buckets have ~2,500 entries in each, and on repeated runs the output should be the same.