richardkundl · January 7, 2014 14:29 · sarus · May 14, 2015 · miriver · Feb 27, 2016
diff --git a/BloomFilter.cs b/BloomFilter.cs
 namespace BloomFilter
 {
 	using System;
 	using System.Collections;

 	/// <summary>
 	/// Bloom filter.
 	/// </summary>
 	/// <typeparam name="T">Item type </typeparam>
 	public class Filter<T>
 	{
 		private readonly int _hashFunctionCount;
 		private readonly BitArray _hashBits;
 		private readonly HashFunction _getHashSecondary;

 		/// <summary>
 		/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
 		/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions.
 		/// </summary>
 		/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
 		public Filter(int capacity)
 			: this(capacity, null)
 		{
 		}

 		/// <summary>
 		/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
 		/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions.
 		/// </summary>
 		/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
 		/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
 		public Filter(int capacity, float errorRate)
 			: this(capacity, errorRate, null)
 		{
 		}

 		/// <summary>
 		/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
 		/// </summary>
 		/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
 		/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
 		public Filter(int capacity, HashFunction hashFunction)
 			: this(capacity, BestErrorRate(capacity), hashFunction)
 		{
 		}

 		/// <summary>
 		/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
 		/// </summary>
 		/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
 		/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
 		/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
 		public Filter(int capacity, float errorRate, HashFunction hashFunction)
 			: this(capacity, errorRate, hashFunction, BestM(capacity, errorRate), BestK(capacity, errorRate))
 		{
 		}

 		/// <summary>
 		/// Creates a new Bloom filter.
 		/// </summary>
 		/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
 		/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
 		/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
 		/// <param name="m">The number of elements in the BitArray.</param>
 		/// <param name="k">The number of hash functions to use.</param>
 		public Filter(int capacity, float errorRate, HashFunction hashFunction, int m, int k)
 		{
 			// validate the params are in range
 			if (capacity < 1)
 			{
 				throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0");
 			}

 			if (errorRate >= 1 || errorRate <= 0)
 			{
 				throw new ArgumentOutOfRangeException("errorRate", errorRate, string.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate));
 			}

 			// from overflow in bestM calculation
 			if (m < 1)
 			{
 				throw new ArgumentOutOfRangeException(string.Format("The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}", capacity, errorRate));
 			}

 			// set the secondary hash function
 			if (hashFunction == null)
 			{
 				if (typeof(T) == typeof(string))
 				{
 					this._getHashSecondary = HashString;
 				}
 				else if (typeof(T) == typeof(int))
 				{
 					this._getHashSecondary = HashInt32;
 				}
 				else
 				{
 					throw new ArgumentNullException("hashFunction", "Please provide a hash function for your type T, when T is not a string or int.");
 				}
 			}
 			else
 			{
 				this._getHashSecondary = hashFunction;
 			}

 			this._hashFunctionCount = k;
 			this._hashBits = new BitArray(m);
 		}

 		/// <summary>
 		/// A function that can be used to hash input.
 		/// </summary>
 		/// <param name="input">The values to be hashed.</param>
 		/// <returns>The resulting hash code.</returns>
 		public delegate int HashFunction(T input);

 		/// <summary>
 		/// The ratio of false to true bits in the filter. E.g., 1 true bit in a 10 bit filter means a truthiness of 0.1.
 		/// </summary>
 		public double Truthiness
 		{
 			get
 			{
 				return (double)this.TrueBits() / this._hashBits.Count;
 			}
 		}

 		/// <summary>
 		/// Adds a new item to the filter. It cannot be removed.
 		/// </summary>
 		/// <param name="item">The item.</param>
 		public void Add(T item)
 		{
 			// start flipping bits for each hash of item
 			int primaryHash = item.GetHashCode();
 			int secondaryHash = this._getHashSecondary(item);
 			for (int i = 0; i < this._hashFunctionCount; i++)
 			{
 				int hash = this.ComputeHash(primaryHash, secondaryHash, i);
 				this._hashBits[hash] = true;
 			}
 		}

 		/// <summary>
 		/// Checks for the existance of the item in the filter for a given probability.
 		/// </summary>
 		/// <param name="item"> The item. </param>
 		/// <returns> The <see cref="bool"/>. </returns>
 		public bool Contains(T item)
 		{
 			int primaryHash = item.GetHashCode();
 			int secondaryHash = this._getHashSecondary(item);
 			for (int i = 0; i < this._hashFunctionCount; i++)
 			{
 				int hash = this.ComputeHash(primaryHash, secondaryHash, i);
 				if (this._hashBits[hash] == false)
 				{
 					return false;
 				}
 			}

 			return true;
 		}

 		/// <summary>
 		/// The best k.
 		/// </summary>
 		/// <param name="capacity"> The capacity. </param>
 		/// <param name="errorRate"> The error rate. </param>
 		/// <returns> The <see cref="int"/>. </returns>
 		private static int BestK(int capacity, float errorRate)
 		{
 			return (int)Math.Round(Math.Log(2.0) * BestM(capacity, errorRate) / capacity);
 		}

 		/// <summary>
 		/// The best m.
 		/// </summary>
 		/// <param name="capacity"> The capacity. </param>
 		/// <param name="errorRate"> The error rate. </param>
 		/// <returns> The <see cref="int"/>. </returns>
 		private static int BestM(int capacity, float errorRate)
 		{
 			return (int)Math.Ceiling(capacity * Math.Log(errorRate, (1.0 / Math.Pow(2, Math.Log(2.0)))));
 		}

 		/// <summary>
 		/// The best error rate.
 		/// </summary>
 		/// <param name="capacity"> The capacity. </param>
 		/// <returns> The <see cref="float"/>. </returns>
 		private static float BestErrorRate(int capacity)
 		{
 			float c = (float)(1.0 / capacity);
 			if (c != 0)
 			{
 				return c;
 			}

 			// default
 			// http://www.cs.princeton.edu/courses/archive/spring02/cs493/lec7.pdf
 			return (float)Math.Pow(0.6185, int.MaxValue / capacity);
 		}

 		/// <summary>
 		/// Hashes a 32-bit signed int using Thomas Wang's method v3.1 (http://www.concentric.net/~Ttwang/tech/inthash.htm).
 		/// Runtime is suggested to be 11 cycles. 
 		/// </summary>
 		/// <param name="input">The integer to hash.</param>
 		/// <returns>The hashed result.</returns>
 		private static int HashInt32(T input)
 		{
 			uint? x = input as uint?;
 			unchecked
 			{
 				x = ~x + (x << 15); // x = (x << 15) - x- 1, as (~x) + y is equivalent to y - x - 1 in two's complement representation
 				x = x ^ (x >> 12);
 				x = x + (x << 2);
 				x = x ^ (x >> 4);
 				x = x * 2057; // x = (x + (x << 3)) + (x<< 11);
 				x = x ^ (x >> 16);
 				return (int)x;
 			}
 		}

 		/// <summary>
 		/// Hashes a string using Bob Jenkin's "One At A Time" method from Dr. Dobbs (http://burtleburtle.net/bob/hash/doobs.html).
 		/// Runtime is suggested to be 9x+9, where x = input.Length. 
 		/// </summary>
 		/// <param name="input">The string to hash.</param>
 		/// <returns>The hashed result.</returns>
 		private static int HashString(T input)
 		{
 			string s = input as string;
 			int hash = 0;

 			for (int i = 0; i < s.Length; i++)
 			{
 				hash += s[i];
 				hash += (hash << 10);
 				hash ^= (hash >> 6);
 			}

 			hash += (hash << 3);
 			hash ^= (hash >> 11);
 			hash += (hash << 15);
 			return hash;
 		}

 		/// <summary>
 		/// The true bits.
 		/// </summary>
 		/// <returns> The <see cref="int"/>. </returns>
 		private int TrueBits()
 		{
 			int output = 0;
 			foreach (bool bit in this._hashBits)
 			{
 				if (bit == true)
 				{
 					output++;
 				}
 			}

 			return output;
 		}

 		/// <summary>
 		/// Performs Dillinger and Manolios double hashing. 
 		/// </summary>
 		/// <param name="primaryHash"> The primary hash. </param>
 		/// <param name="secondaryHash"> The secondary hash. </param>
 		/// <param name="i"> The i. </param>
 		/// <returns> The <see cref="int"/>. </returns>
 		private int ComputeHash(int primaryHash, int secondaryHash, int i)
 		{
 			int resultingHash = (primaryHash + (i * secondaryHash)) % this._hashBits.Count;
 			return Math.Abs((int)resultingHash);
 		}
 	}
 }
diff --git a/Usage.cs b/Usage.cs
 // default usage
 int capacity = 2000000;
 var filter = new Filter<string>(capacity);
 filter.Add("content");
 ...
 filter.Contains("content");
 // profit
	namespace BloomFilter
	{
	using System;
	using System.Collections;

	/// <summary>
	/// Bloom filter.
	/// </summary>
	/// <typeparam name="T">Item type </typeparam>
	public class Filter<T>
	{
	private readonly int _hashFunctionCount;
	private readonly BitArray _hashBits;
	private readonly HashFunction _getHashSecondary;

	/// <summary>
	/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
	/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions.
	/// </summary>
	/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
	public Filter(int capacity)
	: this(capacity, null)
	{
	}

	/// <summary>
	/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
	/// A secondary hash function will be provided for you if your type T is either string or int. Otherwise an exception will be thrown. If you are not using these types please use the overload that supports custom hash functions.
	/// </summary>
	/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
	/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
	public Filter(int capacity, float errorRate)
	: this(capacity, errorRate, null)
	{
	}

	/// <summary>
	/// Creates a new Bloom filter, specifying an error rate of 1/capacity, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
	/// </summary>
	/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
	/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
	public Filter(int capacity, HashFunction hashFunction)
	: this(capacity, BestErrorRate(capacity), hashFunction)
	{
	}

	/// <summary>
	/// Creates a new Bloom filter, using the optimal size for the underlying data structure based on the desired capacity and error rate, as well as the optimal number of hash functions.
	/// </summary>
	/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
	/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
	/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
	public Filter(int capacity, float errorRate, HashFunction hashFunction)
	: this(capacity, errorRate, hashFunction, BestM(capacity, errorRate), BestK(capacity, errorRate))
	{
	}

	/// <summary>
	/// Creates a new Bloom filter.
	/// </summary>
	/// <param name="capacity">The anticipated number of items to be added to the filter. More than this number of items can be added, but the error rate will exceed what is expected.</param>
	/// <param name="errorRate">The accepable false-positive rate (e.g., 0.01F = 1%)</param>
	/// <param name="hashFunction">The function to hash the input values. Do not use GetHashCode(). If it is null, and T is string or int a hash function will be provided for you.</param>
	/// <param name="m">The number of elements in the BitArray.</param>
	/// <param name="k">The number of hash functions to use.</param>
	public Filter(int capacity, float errorRate, HashFunction hashFunction, int m, int k)
	{
	// validate the params are in range
	if (capacity < 1)
	{
	throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0");
	}

	if (errorRate >= 1 \|\| errorRate <= 0)
	{
	throw new ArgumentOutOfRangeException("errorRate", errorRate, string.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate));
	}

	// from overflow in bestM calculation
	if (m < 1)
	{
	throw new ArgumentOutOfRangeException(string.Format("The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}", capacity, errorRate));
	}

	// set the secondary hash function
	if (hashFunction == null)
	{
	if (typeof(T) == typeof(string))
	{
	this._getHashSecondary = HashString;
	}
	else if (typeof(T) == typeof(int))
	{
	this._getHashSecondary = HashInt32;
	}
	else
	{
	throw new ArgumentNullException("hashFunction", "Please provide a hash function for your type T, when T is not a string or int.");
	}
	}
	else
	{
	this._getHashSecondary = hashFunction;
	}

	this._hashFunctionCount = k;
	this._hashBits = new BitArray(m);
	}

	/// <summary>
	/// A function that can be used to hash input.
	/// </summary>
	/// <param name="input">The values to be hashed.</param>
	/// <returns>The resulting hash code.</returns>
	public delegate int HashFunction(T input);

	/// <summary>
	/// The ratio of false to true bits in the filter. E.g., 1 true bit in a 10 bit filter means a truthiness of 0.1.
	/// </summary>
	public double Truthiness
	{
	get
	{
	return (double)this.TrueBits() / this._hashBits.Count;
	}
	}

	/// <summary>
	/// Adds a new item to the filter. It cannot be removed.
	/// </summary>
	/// <param name="item">The item.</param>
	public void Add(T item)
	{
	// start flipping bits for each hash of item
	int primaryHash = item.GetHashCode();
	int secondaryHash = this._getHashSecondary(item);
	for (int i = 0; i < this._hashFunctionCount; i++)
	{
	int hash = this.ComputeHash(primaryHash, secondaryHash, i);
	this._hashBits[hash] = true;
	}
	}

	/// <summary>
	/// Checks for the existance of the item in the filter for a given probability.
	/// </summary>
	/// <param name="item"> The item. </param>
	/// <returns> The <see cref="bool"/>. </returns>
	public bool Contains(T item)
	{
	int primaryHash = item.GetHashCode();
	int secondaryHash = this._getHashSecondary(item);
	for (int i = 0; i < this._hashFunctionCount; i++)
	{
	int hash = this.ComputeHash(primaryHash, secondaryHash, i);
	if (this._hashBits[hash] == false)
	{
	return false;
	}
	}

	return true;
	}

	/// <summary>
	/// The best k.
	/// </summary>
	/// <param name="capacity"> The capacity. </param>
	/// <param name="errorRate"> The error rate. </param>
	/// <returns> The <see cref="int"/>. </returns>
	private static int BestK(int capacity, float errorRate)
	{
	return (int)Math.Round(Math.Log(2.0) * BestM(capacity, errorRate) / capacity);
	}

	/// <summary>
	/// The best m.
	/// </summary>
	/// <param name="capacity"> The capacity. </param>
	/// <param name="errorRate"> The error rate. </param>
	/// <returns> The <see cref="int"/>. </returns>
	private static int BestM(int capacity, float errorRate)
	{
	return (int)Math.Ceiling(capacity * Math.Log(errorRate, (1.0 / Math.Pow(2, Math.Log(2.0)))));
	}

	/// <summary>
	/// The best error rate.
	/// </summary>
	/// <param name="capacity"> The capacity. </param>
	/// <returns> The <see cref="float"/>. </returns>
	private static float BestErrorRate(int capacity)
	{
	float c = (float)(1.0 / capacity);
	if (c != 0)
	{
	return c;
	}

	// default
	// http://www.cs.princeton.edu/courses/archive/spring02/cs493/lec7.pdf
	return (float)Math.Pow(0.6185, int.MaxValue / capacity);
	}

	/// <summary>
	/// Hashes a 32-bit signed int using Thomas Wang's method v3.1 (http://www.concentric.net/~Ttwang/tech/inthash.htm).
	/// Runtime is suggested to be 11 cycles.
	/// </summary>
	/// <param name="input">The integer to hash.</param>
	/// <returns>The hashed result.</returns>
	private static int HashInt32(T input)
	{
	uint? x = input as uint?;
	unchecked
	{
	x = ~x + (x << 15); // x = (x << 15) - x- 1, as (~x) + y is equivalent to y - x - 1 in two's complement representation
	x = x ^ (x >> 12);
	x = x + (x << 2);
	x = x ^ (x >> 4);
	x = x * 2057; // x = (x + (x << 3)) + (x<< 11);
	x = x ^ (x >> 16);
	return (int)x;
	}
	}

	/// <summary>
	/// Hashes a string using Bob Jenkin's "One At A Time" method from Dr. Dobbs (http://burtleburtle.net/bob/hash/doobs.html).
	/// Runtime is suggested to be 9x+9, where x = input.Length.
	/// </summary>
	/// <param name="input">The string to hash.</param>
	/// <returns>The hashed result.</returns>
	private static int HashString(T input)
	{
	string s = input as string;
	int hash = 0;

	for (int i = 0; i < s.Length; i++)
	{
	hash += s[i];
	hash += (hash << 10);
	hash ^= (hash >> 6);
	}

	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return hash;
	}

	/// <summary>
	/// The true bits.
	/// </summary>
	/// <returns> The <see cref="int"/>. </returns>
	private int TrueBits()
	{
	int output = 0;
	foreach (bool bit in this._hashBits)
	{
	if (bit == true)
	{
	output++;
	}
	}

	return output;
	}

	/// <summary>
	/// Performs Dillinger and Manolios double hashing.
	/// </summary>
	/// <param name="primaryHash"> The primary hash. </param>
	/// <param name="secondaryHash"> The secondary hash. </param>
	/// <param name="i"> The i. </param>
	/// <returns> The <see cref="int"/>. </returns>
	private int ComputeHash(int primaryHash, int secondaryHash, int i)
	{
	int resultingHash = (primaryHash + (i * secondaryHash)) % this._hashBits.Count;
	return Math.Abs((int)resultingHash);
	}
	}
	}
	// default usage
	int capacity = 2000000;
	var filter = new Filter<string>(capacity);
	filter.Add("content");
	...
	filter.Contains("content");
	// profit