Created
December 14, 2011 00:36
-
-
Save tomgibara/1474649 to your computer and use it in GitHub Desktop.
Crinch BloomFilter introduction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* This introduction assumes familiarity with the general concept of | |
* Bloom filters. If you don't have this try reading: | |
* | |
* http://en.wikipedia.org/wiki/Bloom_filter | |
*/ | |
// HASHING | |
/** | |
* Any Bloom filter requires a ready supply of hashes. In this | |
* implementation, hashing is a separate abstraction that allows for an | |
* unlimited variety of hash generating algorithms. Different | |
* applications will benefit from different algorithms, but to get | |
* started we can use this: | |
*/ | |
ObjectHash<String> hash = new ObjectHash<String>(); | |
/** | |
* This simply exposes the String.hashCode() method as a Hash, so: | |
*/ | |
assertEqual(hash.hashAsInt("Arnold"), "Arnold".hashCode()); | |
/** | |
* It also helpfully deals with null values by the simple expedient of | |
* hashing them to zero: | |
*/ | |
assertEqual(hash.hashAsInt(null), 0); | |
/** | |
* But for Bloom filters we generally need multiple hashes each entry. | |
* Objects that can generate more than one hash for a single argument | |
* implement the MultHash interface. There's a convenient implementation | |
* of this too: | |
*/ | |
ObjectMultiHash<String> multiHash = new ObjectMultiHash<String>(99); | |
/** | |
* That 99 in the constructor specifies a maximum value for the hash | |
* values it generates. This is an inclusive upper bound so we can be | |
* sure that: | |
*/ | |
assertTrue(multiHash.hashAsInt("Kipper") < 100); | |
assertTrue(multiHash.hashAsInt("Tiger") < 100); | |
assertTrue(multiHash.hashAsInt("Arnold") < 100); | |
/** | |
* This upper bound is important because it provides a convenient way to | |
* generate hashes that cover the number of bits in our Bloom filter. | |
*/ | |
// BLOOM FILTERS | |
/** | |
* There's lots more useful code in the com.tomgibara.crinch.hashing package | |
* to deal with hashing, but we have all we need to create our Bloom filter: | |
*/ | |
BloomFilter<String> filter = new BasicBloomFilter<String>(multiHash, 10); | |
/** | |
* This will create a Bloom filter with a capacity of 100 (equal to the | |
* range of our MultiHash) that will set 10 bits for every String we add | |
* to it. Adding elements is easy: | |
*/ | |
filter.add("Kipper"); | |
/** | |
* You can't remove elements from a Bloom filter, but you can clear it: | |
*/ | |
filter.clear(); | |
/** | |
* Obviously, the filter is empty immediately after it's been cleared: | |
*/ | |
assertTrue( filter.isEmpty() ); | |
/** | |
* Any time you add elements to a bloom filter, a boolean is returned that | |
* indicates whether the filter was modified as a result of the addition. | |
* This is analagous to the way the Java Collections API operates: | |
*/ | |
assertTrue( filter.add("Kipper") ); | |
/** | |
* It's also possible to add multiple elements at once: | |
*/ | |
List<String> kippersFriends = Arrays.asList("Tiger", "Pig"); | |
filter.addAll(kippersFriends); | |
/** | |
* A Bloom filter is no use without the ability to test whether it | |
* contains particular values. This is possible using the mightContain() | |
* method: | |
*/ | |
assertTrue( filter.mightContain("Kipper") ); | |
assertFalse( filter.mightContain("Arnold") ); | |
/** | |
* There's a convenient method for verifying the containment of | |
* collections of elements too: | |
*/ | |
assertTrue( filter.mightContainAll(kippersFriends) ); | |
/** | |
* Of course, the word "might" appears in these methods because Bloom | |
* filters admit false positives, that is, they may claim to contain an | |
* element that was never actually added. So what is the probability of | |
* a false positive for our Bloom filter? It can estimate it for us: | |
*/ | |
filter.getFalsePositiveProbability(); // tiny, about 1 in 10 million | |
/** | |
* But this will increase as we add more elements: | |
*/ | |
List<String> alphabet = Arrays.asList("ABCDEFGHIJKLMNOPQRSTUVWXYZ".split("")); | |
filter.addAll(alphabet); // we add another 26 entries | |
filter.getFalsePositiveProbability(); // now not so tiny, about 1 in 2 | |
/** | |
* Statistics may lie, but probabilities don't; sure enough: | |
*/ | |
assertTrue( filter.mightContain("Tom") ); | |
assertFalse( filter.mightContain("Dick") ); | |
assertFalse( filter.mightContain("Harry") ); | |
assertTrue( filter.mightContain("and all...") ); | |
// COMPATIBLE BLOOM FILTERS | |
/** | |
* In addition to these basic operations, there are some extra things we | |
* can do when we have more than one Bloom filter: | |
*/ | |
BloomFilter<String> otherFilter = new BasicBloomFilter<String>(multiHash, 10); | |
/** | |
* For example we can check if they are equal: | |
*/ | |
assertFalse( otherFilter.equals(filter) ); | |
/** | |
* To take things further we need the notion of two 'compatible' Bloom | |
* filters. Essentially two Bloom filters are compatible when they | |
* generate the same bit patterns. For implementations of BloomFilter | |
* this means that they have the same capacity, the same hash count and | |
* equal multi-hashes. Because our two Bloom filters are compatible we | |
* do this: | |
*/ | |
otherFilter.addAll(filter); | |
/** | |
* This is generally an efficient bitwise operation and the only way of | |
* adding elements from one Bloom filter to another (since it's | |
* impossible to retrieve the elements that were added). We now find | |
* that: | |
*/ | |
assertTrue( otherFilter.equals(filter) ); | |
/** | |
* Two Bloom filters are equal if they are compatible and have the same | |
* bit pattern. Between compatible Bloom filters, we also have a stronger | |
* containment test: | |
*/ | |
assertTrue( otherFilter.containsAll(filter) ); | |
assertTrue( filter.containsAll(otherFilter) ); | |
// BIT VECTORS | |
/** | |
* All BloomFilter implementations are required to expose their bits | |
* via a BitVector: | |
*/ | |
BitVector bits = filter.getBitVector(); | |
/** | |
* This is a very powerful class that can perform a multitude of bit | |
* related operations. Importantly, BitVectors are serializable: | |
*/ | |
ByteArrayOutputStream bytes = new ByteArrayOutputStream(); | |
ObjectOutputStream out = new ObjectOutputStream(bytes); | |
out.writeObject(bits); | |
out.close(); | |
ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())); | |
BitVector otherBits = (BitVector) in.readObject(); | |
/** | |
* The persisted bits can be used to reconstruct an equal filter: | |
*/ | |
otherFilter = new BasicBloomFilter<String>(otherBits.mutable(), multiHash, 10); | |
assertEqual(otherFilter, filter); | |
/** | |
* Care must be taken to ensure that all the other parameters of | |
* the BloomFilter remain consistent after deserialization, specifically | |
* the multi-hash and the hash count. | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment