Created
May 7, 2011 03:14
-
-
Save evanlh/960165 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package katas.kata5; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.Reader; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.BitSet; | |
public class BloomFilter { | |
private static final int BITS = 19; | |
private static final int NBITS = 1 << BITS; | |
private static final int MASK = NBITS - 1; | |
private final BitSet bits; | |
private int wordsTrained = 0; | |
public BloomFilter() { | |
bits = new BitSet(NBITS); | |
} | |
public void train(Reader in) throws IOException { | |
BufferedReader r = new BufferedReader(in); | |
String line; | |
while (null != (line = r.readLine())) { | |
trainWord(line.trim()); | |
++wordsTrained; | |
} | |
System.out.println(bits.cardinality()); | |
System.out.println((bits.cardinality() * 100.0) / NBITS); | |
} | |
public int getWordsTrained() { | |
return wordsTrained; | |
} | |
public boolean has(String word) { | |
return check(hashes(word)); | |
} | |
public void trainWord(String word) { | |
insert(hashes(word)); | |
} | |
private boolean check(int[] hashes) { | |
for (int hash : hashes) { | |
if (!bits.get(hash)) { | |
return false; | |
} | |
} | |
return true; | |
} | |
private int[] hashes(String word) { | |
byte[] digest = digest(word); | |
int[] hashes = new int[digest.length / ((BITS + 7) / 8)]; | |
int i = 0; | |
int currentHash = 0, currentBits = 0; | |
for (int j = 0; j < digest.length; j++) { | |
if (currentBits >= BITS) { | |
hashes[i++] = currentHash & MASK; | |
currentHash = 0; | |
currentBits = 0; | |
} | |
int b = digest[j]; | |
currentHash = currentHash << 8 | (b & 0xFF); | |
currentBits += 8; | |
} | |
return hashes; | |
} | |
private void insert(int[] hashes) { | |
for (int hash : hashes) { | |
bits.set(hash); | |
} | |
} | |
private byte[] digest(String word) { | |
try { | |
return MessageDigest.getInstance("MD5").digest(word.getBytes()); | |
} catch (NoSuchAlgorithmException e) { | |
throw new RuntimeException(e); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment