Created
May 24, 2011 19:27
-
-
Save llimllib/989472 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This is from http://code.google.com/p/concurrentlinkedhashmap/wiki/BloomFilter, I just copied it here so I can link to a line of it */ | |
import static java.lang.Long.bitCount; | |
import static java.lang.Long.toBinaryString; | |
import static java.lang.Math.abs; | |
import static java.lang.Math.ceil; | |
import static java.lang.Math.exp; | |
import static java.lang.Math.log; | |
import static java.lang.Math.max; | |
import static java.lang.Math.pow; | |
import static java.util.Arrays.copyOf; | |
import java.util.AbstractSet; | |
import java.util.Arrays; | |
import java.util.Iterator; | |
import java.util.Set; | |
import java.util.concurrent.atomic.AtomicStampedReference; | |
import java.util.concurrent.locks.Lock; | |
import java.util.concurrent.locks.ReentrantLock; | |
/** | |
* A thread-safe Bloom filter implementation that uses a restricted version of the | |
* {@link Set} interface. All mutative operations are implemented by making a fresh | |
* copy of the underlying array. The set does not support traversal or removal | |
* operations and may report false positives on membership queries. | |
* <p> | |
* The copy-on-write model is ordinarily too costly, but due to the high read/write | |
* ratio common to Bloom filters this may be <em>more</em> efficient than alternatives | |
* when membership operations vastly out number mutations. It is useful when you cannot | |
* or do not want to synchronize membership queries, yet need to preclude interference | |
* among concurrent threads. | |
* <p> | |
* A Bloom filter is a space and time efficient probabilistic data structure that is used | |
* to test whether an element is a member of a set. False positives are possible, but false | |
* negatives are not. Elements can be added to the set, but not removed. The more elements | |
* that are added to the set the larger the probability of false positives. While risking | |
* false positives, Bloom filters have a space advantage over other data structures for | |
* representing sets by not storing the data items. | |
* | |
* @author <a href="mailto:[email protected]">Ben Manes</a> | |
*/ | |
public final class BloomFilter<E> extends AbstractSet<E> { | |
private final AtomicStampedReference<long[]> ref; | |
private final float probability; | |
private final int capacity; | |
private final int length; | |
private final int hashes; | |
private final Lock lock; | |
private final int bits; | |
/** | |
* Creates a Bloom filter that can store up to an expected maximum capacity with an acceptable probability | |
* that a membership query will result in a false positive. The filter will size itself based on the given | |
* parameters. | |
* | |
* @param capacity The expected maximum number of elements to be inserted into the Bloom filter. | |
* @param probability The acceptable false positive probability for membership queries. | |
*/ | |
public BloomFilter(int capacity, float probability) { | |
if ((capacity <= 0) || (probability <= 0) || (probability >= 1)) { | |
throw new IllegalArgumentException(); | |
} | |
this.capacity = max(capacity, Long.SIZE); | |
this.bits = bits(capacity, probability); | |
this.length = bits / Long.SIZE; | |
this.lock = new ReentrantLock(); | |
this.hashes = numberOfHashes(capacity, bits); | |
this.probability = probability(hashes, capacity, bits); | |
this.ref = new AtomicStampedReference<long[]>(new long[length], 0); | |
} | |
/** | |
* Calculates the <tt>false positive probability</tt> of the {@link #contains(Object)} | |
* method returning <tt>true</tt> for an object that had not been inserted into the | |
* Bloom filter. | |
* | |
* @param hashes The number of hashing algorithms applied to an element. | |
* @param capacity The estimated number of elements to be inserted into the Bloom filter. | |
* @param bits The number of bits that can be used for storing membership. | |
* @return The estimated false positive probability. | |
*/ | |
private static float probability(int hashes, int capacity, int bits) { | |
return (float) pow((1 - exp(-hashes * ((double) capacity / bits))), hashes); | |
} | |
/** | |
* Calculates the optimal number of hashing algorithms, k, at a given sizing to | |
* minimize the probability. | |
* | |
* @param capacity The estimated number of elements to be inserted into the Bloom filter. | |
* @param bits The number of bits that can be used for storing membership. | |
* @return The optimal number of hashing functions. | |
*/ | |
private static int numberOfHashes(int capacity, int bits) { | |
return (int) ceil((bits / capacity) * log(2)); | |
} | |
/** | |
* Calculates the required number of bits assuming an optimal number of hashing | |
* algorithms are available. The optimal value is normalized to the closest word. | |
* | |
* @param capacity The estimated number of elements to be inserted into the Bloom filter. | |
* @param probability The desired false positive probability of a membership query. | |
* @return The required number of storage bits. | |
*/ | |
private static int bits(int capacity, float probability) { | |
int optimal = (int) ceil(abs(capacity * log(probability) / pow(log(2), 2))); | |
int offset = Long.SIZE - (optimal % Long.SIZE); | |
return (offset == 0) ? optimal : (optimal + offset); | |
} | |
/** | |
* Returns the <tt>false positive probability</tt> of the {@link #contains(Object)} | |
* method returning <tt>true</tt> for an object that had not been inserted into the | |
* Bloom filter. | |
* | |
* @return The false positive probability for membership queries. | |
*/ | |
public float probability() { | |
return probability; | |
} | |
/** | |
* Returns the expected maximum number of elements that may be inserted into the Bloom filter. | |
* The Bloom filter's space efficiency has been optimized to support this capacity with the | |
* false positive probability specified by {@link #probability()}. | |
* | |
* @return The expected maximum number of elements to be inserted into the Bloom filter. | |
*/ | |
public int capacity() { | |
return capacity; | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public int size() { | |
return ref.getStamp(); | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public void clear() { | |
lock.lock(); | |
try { | |
ref.set(new long[length], 0); | |
} finally { | |
lock.unlock(); | |
} | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public boolean contains(Object o) { | |
int[] size = new int[1]; | |
long[] words = ref.get(size); | |
if (size[0] == 0) { | |
return false; | |
} | |
int[] indexes = indexes(o); | |
for (int index : indexes) { | |
if (!getAt(index, words)) { | |
return false; | |
} | |
} | |
return true; | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public boolean add(E o) { | |
int[] indexes = indexes(o); | |
boolean added = false; | |
lock.lock(); | |
try { | |
int[] size = new int[1]; | |
long[] words = copyOf(ref.get(size), length); | |
for (int index : indexes) { | |
added |= setAt(index, words); | |
} | |
if (added) { | |
ref.set(words, ++size[0]); | |
} | |
} finally { | |
lock.unlock(); | |
} | |
return added; | |
} | |
/** | |
* Retrieves the flag stored at the index location in the given array. | |
* | |
* @param index The bit location of the flag. | |
* @param words The array to lookup in. | |
* @return The flag's value. | |
*/ | |
private boolean getAt(int index, long[] words) { | |
int i = index/bits; | |
int bitIndex = index % bits; | |
return (words[i] & (1L << bitIndex)) != 0; | |
} | |
/** | |
* Sets the flag stored at the index location in the given array. | |
* | |
* @param index The bit location of the flag. | |
* @param words The array to update. | |
* @return If updated. | |
*/ | |
private boolean setAt(int index, long[] words) { | |
int i = index/bits; | |
int bitIndex = index % bits; | |
long mask = (1L << bitIndex); | |
if ((words[i] & mask) == 0) { | |
words[i] |= mask; | |
return true; | |
} | |
return false; | |
} | |
/** | |
* Calculates the index position for the object. | |
* | |
* @param o The object. | |
* @return The index to the bit. | |
*/ | |
private int[] indexes(Object o) { | |
// Double hashing allows calculating multiple index locations | |
int hashCode = o.hashCode(); | |
int probe = 1 + abs(hashCode % length); | |
int[] indexes = new int[hashes]; | |
int h = hash(hashCode); | |
for (int i=0; i<hashes; i++) { | |
indexes[i] = abs(h ^ i*probe) % bits; | |
} | |
return indexes; | |
} | |
/** | |
* Doug Lea's universal hashing algorithm used in the collection libraries. | |
*/ | |
private int hash(int hashCode) { | |
// Spread bits using variant of single-word Wang/Jenkins hash | |
hashCode += (hashCode << 15) ^ 0xffffcd7d; | |
hashCode ^= (hashCode >>> 10); | |
hashCode += (hashCode << 3); | |
hashCode ^= (hashCode >>> 6); | |
hashCode += (hashCode << 2) + (hashCode << 14); | |
return hashCode ^ (hashCode >>> 16); | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public Iterator<E> iterator() { | |
throw new UnsupportedOperationException(); | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public boolean equals(Object o) { | |
if (o == this) { | |
return true; | |
} else if (!(o instanceof BloomFilter<?>)) { | |
return false; | |
} | |
BloomFilter<?> filter = (BloomFilter<?>) o; | |
int[] size1 = new int[1]; | |
int[] size2 = new int[1]; | |
long[] words1 = ref.get(size1); | |
long[] words2 = filter.ref.get(size2); | |
return (size1[0] == size2[0]) && Arrays.equals(words1, words2); | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public int hashCode() { | |
return Arrays.hashCode(ref.getReference()); | |
} | |
/** | |
* {@inheritDoc} | |
*/ | |
@Override | |
public String toString() { | |
int size[] = new int[1]; | |
long[] words = ref.get(size); | |
StringBuilder builder = new StringBuilder("{"); | |
builder.append("probability=").append(probability).append(", ") | |
.append("hashes=").append(hashes).append(", ") | |
.append("capacity=").append(capacity).append(", ") | |
.append("size=").append(size[0]).append(", ") | |
.append("bits=").append(bits).append(", ") | |
.append("bit-count=").append(toBitCount(words)).append(", ") | |
.append("value=").append(toBinaryArrayString(words)) | |
.append('}'); | |
return builder.toString(); | |
} | |
/** | |
* Calculates the population count, the number of one-bits, in the words. | |
* | |
* @param words The consistent view of the data. | |
* @return The number of one-bits in the two's complement binary representation. | |
*/ | |
private int toBitCount(long[] words) { | |
int population = 0; | |
for (long word : words) { | |
population += bitCount(word); | |
} | |
return population; | |
} | |
/** | |
* Creates a pretty-printed binary string representation of the data. | |
* | |
* @param words The consistent view of the data. | |
* @return A binary string representation. | |
*/ | |
private String toBinaryArrayString(long[] words) { | |
StringBuilder builder = new StringBuilder(words.length); | |
boolean trim = true; | |
for (int i=words.length-1; i>=0; i--) { | |
long word = words[i]; | |
if ((word == 0) && trim) { | |
continue; | |
} | |
builder.append(toBinaryString(word)); | |
trim = false; | |
} | |
if (trim) { | |
builder.append('0'); | |
} | |
return builder.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment