-
-
Save automatonic/3725443 to your computer and use it in GitHub Desktop.
/* | |
This code is public domain. | |
The MurmurHash3 algorithm was created by Austin Appleby and put into the public domain. See http://code.google.com/p/smhasher/ | |
This C# variant was authored by | |
Elliott B. Edwards and was placed into the public domain as a gist | |
Status...Working on verification (Test Suite) | |
Set up to run as a LinqPad (linqpad.net) script (thus the ".Dump()" call) | |
*/ | |
public static class MurMurHash3 | |
{ | |
//Change to suit your needs | |
const uint seed = 144; | |
public static int Hash(Stream stream) | |
{ | |
const uint c1 = 0xcc9e2d51; | |
const uint c2 = 0x1b873593; | |
uint h1 = seed; | |
uint k1 = 0; | |
uint streamLength = 0; | |
using (BinaryReader reader = new BinaryReader(stream)) | |
{ | |
byte[] chunk = reader.ReadBytes(4); | |
while (chunk.Length > 0) | |
{ | |
streamLength += (uint)chunk.Length; | |
switch(chunk.Length) | |
{ | |
case 4: | |
/* Get four bytes from the input into an uint */ | |
k1 = (uint) | |
(chunk[0] | |
| chunk[1] << 8 | |
| chunk[2] << 16 | |
| chunk[3] << 24); | |
/* bitmagic hash */ | |
k1 *= c1; | |
k1 = rotl32(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
h1 = rotl32(h1, 13); | |
h1 = h1 * 5 + 0xe6546b64; | |
break; | |
case 3: | |
k1 = (uint) | |
(chunk[0] | |
| chunk[1] << 8 | |
| chunk[2] << 16); | |
k1 *= c1; | |
k1 = rotl32(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
break; | |
case 2: | |
k1 = (uint) | |
(chunk[0] | |
| chunk[1] << 8); | |
k1 *= c1; | |
k1 = rotl32(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
break; | |
case 1: | |
k1 = (uint)(chunk[0]); | |
k1 *= c1; | |
k1 = rotl32(k1, 15); | |
k1 *= c2; | |
h1 ^= k1; | |
break; | |
} | |
chunk = reader.ReadBytes(4); | |
} | |
} | |
// finalization, magic chants to wrap it all up | |
h1 ^= streamLength; | |
h1 = fmix(h1); | |
unchecked //ignore overflow | |
{ | |
return (int)h1; | |
} | |
} | |
private static uint rotl32(uint x, byte r) | |
{ | |
return (x << r) | (x >> (32 - r)); | |
} | |
private static uint fmix(uint h) | |
{ | |
h ^= h >> 16; | |
h *= 0x85ebca6b; | |
h ^= h >> 13; | |
h *= 0xc2b2ae35; | |
h ^= h >> 16; | |
return h; | |
} | |
} | |
void Main() | |
{ | |
Encoding encoding = new UTF8Encoding(); | |
byte[] input = encoding.GetBytes("The quick brown fox jumps over the lazy dog"); | |
using (MemoryStream stream = new MemoryStream(input)) | |
{ | |
int hash = MurMurHash3.Hash(stream); | |
new {Hash=hash, Bytes = BitConverter.GetBytes(hash) }.Dump("Result"); | |
} | |
} |
Just confirmed with @sapbucket, same result when using with UTF8. DONT USE THIS IMPLEMENTATION
I ended up using this one https://github.com/jitbit/MurmurHash.net
Hmm... MurMurHash3
looks fine to me. I can't reproduce @sapbucket's problem (with .NET 4.5.2, Windows 10, 64-bit) using any default encodings from System.Text.Encoding
or new UTF8Encoding()
public static int GetHash(string uniqueString) // "0,0,0,15,7" and "0,3,13,25,12" return different values
{
byte[] input = Encoding.UTF8.GetBytes(uniqueString); // Values stay different for System.Text.Encoding.Unicode, UTF8, etc.
using (var stream = new MemoryStream(input))
return MurMurHash3.Hash(stream);
}
It was my understanding that each unique byte[] would produce a unique hash.
This is incorrect understanding of hash as a concept. What you see is "hash collision" and it is NORMAL.
Hash is a function that maps all possible byte sequences of arbitrary length into a hash space - 32 bit for this. There are WAY more (countable but infinite) number of possible combinations of bytes that you can feed into hash functions but hash can output only 2^32 numbers. By pidgin hole principle - some inputs will be mapped to the same hash value.
if hash(x) = hash(y) => it doesn't mean that x=y.
Since this hash is a deterministic algorithm (for same input it always gives same output) - The only guarantee that hash give you is that:
if hash(x) <> hash(y) => it means that x <> y
You can think of hash as a guy that assigns a bucket number to the given byte string input.
Where hash space (all possible values, 2^32 here) - is total number of buckets
If you feed small number of input samples and your hash function is "good" (regarding random uniform distribution) - you can hope that all your inputs will get unique hash values assigned. But there is a chance that some buckets (hash values) will be associated with two or even more inputs. The "better" the hash - the lower such chances. Even with SHA1 and other crypto hashes this "hash collisions" are possible.
Imagine that your test set has more than 2^32 input samples - how hash function would be able to give you unique values for each of the input sample?
sapbucket, I notice that you're using the static Encoding class whereas automatonic is utilizing the instantiated UTF8 encoding... You could be encoding your values to bytes improperly