Created
January 7, 2014 14:29
-
-
Save richardkundl/8300092 to your computer and use it in GitHub Desktop.
Bloom filter implementation in c#.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace BloomFilter | |
{ | |
using System; | |
using System.Collections; | |
/// <summary> | |
/// Bloom filter. | |
/// </summary> | |
/// <typeparam name="T">Item type </typeparam> | |
public class Filter<T> | |
{ | |
private readonly int _hashFunctionCount; | |
private readonly BitArray _hashBits; | |
private readonly HashFunction _getHashSecondary; | |
/// <summary> | |
/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. | |
/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions. | |
/// </summary> | |
/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param> | |
public Filter(int capacity) | |
: this(capacity, null) | |
{ | |
} | |
/// <summary> | |
/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. | |
/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions. | |
/// </summary> | |
/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param> | |
/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param> | |
public Filter(int capacity, float errorRate) | |
: this(capacity, errorRate, null) | |
{ | |
} | |
/// <summary> | |
/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. | |
/// </summary> | |
/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param> | |
/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param> | |
public Filter(int capacity, HashFunction hashFunction) | |
: this(capacity, BestErrorRate(capacity), hashFunction) | |
{ | |
} | |
/// <summary> | |
/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions. | |
/// </summary> | |
/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param> | |
/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param> | |
/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param> | |
public Filter(int capacity, float errorRate, HashFunction hashFunction) | |
: this(capacity, errorRate, hashFunction, BestM(capacity, errorRate), BestK(capacity, errorRate)) | |
{ | |
} | |
/// <summary> | |
/// Creates a new Bloom filter. | |
/// </summary> | |
/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param> | |
/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param> | |
/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param> | |
/// <param name="m">The number of elements in the BitArray.</param> | |
/// <param name="k">The number of hash functions to use.</param> | |
public Filter(int capacity, float errorRate, HashFunction hashFunction, int m, int k) | |
{ | |
// validate the params are in range | |
if (capacity < 1) | |
{ | |
throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0"); | |
} | |
if (errorRate >= 1 || errorRate <= 0) | |
{ | |
throw new ArgumentOutOfRangeException("errorRate", errorRate, string.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate)); | |
} | |
// from overflow in bestM calculation | |
if (m < 1) | |
{ | |
throw new ArgumentOutOfRangeException(string.Format("The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}", capacity, errorRate)); | |
} | |
// set the secondary hash function | |
if (hashFunction == null) | |
{ | |
if (typeof(T) == typeof(string)) | |
{ | |
this._getHashSecondary = HashString; | |
} | |
else if (typeof(T) == typeof(int)) | |
{ | |
this._getHashSecondary = HashInt32; | |
} | |
else | |
{ | |
throw new ArgumentNullException("hashFunction", "Please provide a hash function for your type T, when T is not a string or int."); | |
} | |
} | |
else | |
{ | |
this._getHashSecondary = hashFunction; | |
} | |
this._hashFunctionCount = k; | |
this._hashBits = new BitArray(m); | |
} | |
/// <summary> | |
/// A function that can be used to hash input. | |
/// </summary> | |
/// <param name="input">The values to be hashed.</param> | |
/// <returns>The resulting hash code.</returns> | |
public delegate int HashFunction(T input); | |
/// <summary> | |
/// The ratio of false to true bits in the filter. E.g., 1 true bit in a 10 bit filter means a truthiness of 0.1. | |
/// </summary> | |
public double Truthiness | |
{ | |
get | |
{ | |
return (double)this.TrueBits() / this._hashBits.Count; | |
} | |
} | |
/// <summary> | |
/// Adds a new item to the filter. It cannot be removed. | |
/// </summary> | |
/// <param name="item">The item.</param> | |
public void Add(T item) | |
{ | |
// start flipping bits for each hash of item | |
int primaryHash = item.GetHashCode(); | |
int secondaryHash = this._getHashSecondary(item); | |
for (int i = 0; i < this._hashFunctionCount; i++) | |
{ | |
int hash = this.ComputeHash(primaryHash, secondaryHash, i); | |
this._hashBits[hash] = true; | |
} | |
} | |
/// <summary> | |
/// Checks for the existance of the item in the filter for a given probability. | |
/// </summary> | |
/// <param name="item"> The item. </param> | |
/// <returns> The <see cref="bool"/>. </returns> | |
public bool Contains(T item) | |
{ | |
int primaryHash = item.GetHashCode(); | |
int secondaryHash = this._getHashSecondary(item); | |
for (int i = 0; i < this._hashFunctionCount; i++) | |
{ | |
int hash = this.ComputeHash(primaryHash, secondaryHash, i); | |
if (this._hashBits[hash] == false) | |
{ | |
return false; | |
} | |
} | |
return true; | |
} | |
/// <summary> | |
/// The best k. | |
/// </summary> | |
/// <param name="capacity"> The capacity. </param> | |
/// <param name="errorRate"> The error rate. </param> | |
/// <returns> The <see cref="int"/>. </returns> | |
private static int BestK(int capacity, float errorRate) | |
{ | |
return (int)Math.Round(Math.Log(2.0) * BestM(capacity, errorRate) / capacity); | |
} | |
/// <summary> | |
/// The best m. | |
/// </summary> | |
/// <param name="capacity"> The capacity. </param> | |
/// <param name="errorRate"> The error rate. </param> | |
/// <returns> The <see cref="int"/>. </returns> | |
private static int BestM(int capacity, float errorRate) | |
{ | |
return (int)Math.Ceiling(capacity * Math.Log(errorRate, (1.0 / Math.Pow(2, Math.Log(2.0))))); | |
} | |
/// <summary> | |
/// The best error rate. | |
/// </summary> | |
/// <param name="capacity"> The capacity. </param> | |
/// <returns> The <see cref="float"/>. </returns> | |
private static float BestErrorRate(int capacity) | |
{ | |
float c = (float)(1.0 / capacity); | |
if (c != 0) | |
{ | |
return c; | |
} | |
// default | |
// http://www.cs.princeton.edu/courses/archive/spring02/cs493/lec7.pdf | |
return (float)Math.Pow(0.6185, int.MaxValue / capacity); | |
} | |
/// <summary> | |
/// Hashes a 32-bit signed int using Thomas Wang's method v3.1 (http://www.concentric.net/~Ttwang/tech/inthash.htm). | |
/// Runtime is suggested to be 11 cycles. | |
/// </summary> | |
/// <param name="input">The integer to hash.</param> | |
/// <returns>The hashed result.</returns> | |
private static int HashInt32(T input) | |
{ | |
uint? x = input as uint?; | |
unchecked | |
{ | |
x = ~x + (x << 15); // x = (x << 15) - x- 1, as (~x) + y is equivalent to y - x - 1 in two's complement representation | |
x = x ^ (x >> 12); | |
x = x + (x << 2); | |
x = x ^ (x >> 4); | |
x = x * 2057; // x = (x + (x << 3)) + (x<< 11); | |
x = x ^ (x >> 16); | |
return (int)x; | |
} | |
} | |
/// <summary> | |
/// Hashes a string using Bob Jenkin's "One At A Time" method from Dr. Dobbs (http://burtleburtle.net/bob/hash/doobs.html). | |
/// Runtime is suggested to be 9x+9, where x = input.Length. | |
/// </summary> | |
/// <param name="input">The string to hash.</param> | |
/// <returns>The hashed result.</returns> | |
private static int HashString(T input) | |
{ | |
string s = input as string; | |
int hash = 0; | |
for (int i = 0; i < s.Length; i++) | |
{ | |
hash += s[i]; | |
hash += (hash << 10); | |
hash ^= (hash >> 6); | |
} | |
hash += (hash << 3); | |
hash ^= (hash >> 11); | |
hash += (hash << 15); | |
return hash; | |
} | |
/// <summary> | |
/// The true bits. | |
/// </summary> | |
/// <returns> The <see cref="int"/>. </returns> | |
private int TrueBits() | |
{ | |
int output = 0; | |
foreach (bool bit in this._hashBits) | |
{ | |
if (bit == true) | |
{ | |
output++; | |
} | |
} | |
return output; | |
} | |
/// <summary> | |
/// Performs Dillinger and Manolios double hashing. | |
/// </summary> | |
/// <param name="primaryHash"> The primary hash. </param> | |
/// <param name="secondaryHash"> The secondary hash. </param> | |
/// <param name="i"> The i. </param> | |
/// <returns> The <see cref="int"/>. </returns> | |
private int ComputeHash(int primaryHash, int secondaryHash, int i) | |
{ | |
int resultingHash = (primaryHash + (i * secondaryHash)) % this._hashBits.Count; | |
return Math.Abs((int)resultingHash); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// default usage | |
int capacity = 2000000; | |
var filter = new Filter<string>(capacity); | |
filter.Add("content"); | |
... | |
filter.Contains("content"); | |
// profit |
Either he got it from here: https://bloomfilter.codeplex.com/SourceControl/latest#BloomFilter/Filter.cs which specifies a license, OR
Microsoft got their project from Richard.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@richardkundl Thank you for providing this reference implementation. Would you be able to specify an explicit license for this gist so I know how the code can be used? Thank you!.