Created
May 15, 2014 19:29
-
-
Save adamretter/dba53d7f3c881a3b4f3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package wlhn | |
import com.twitter.hashing.KeyHasher | |
object BloomApp extends App { | |
/** | |
* A bloom filter is used to test whether an element is a member of a set. | |
* | |
* False positive matches are possible, | |
* but false negatives are not; | |
* i.e. a query returns either "possibly in set" or "definitely not in set" | |
* | |
* Elements can be added to the set, | |
* but not removed (though this can be addressed with a "counting" filter). | |
* The more elements that are added to the set, the larger the probability of false positives. | |
*/ | |
val k = 11 //optimal value of K is bits per item (16 for us) * 0.7 | |
val m = 0x10000 | |
val storage = new Array[Byte](m) | |
type ByteProvider = { | |
def getBytes(): Array[Byte] | |
} | |
/** | |
* k hashes for an element | |
*/ | |
def hash[T <: ByteProvider](element: T) : Seq[Int] = { | |
for(i <- (1 to k)) yield hashFunction(element, i) | |
} | |
/** | |
* The hash function when given an element | |
* returns a position in an array | |
*/ | |
def hashFunction[T <: ByteProvider](element: T, shift: Int) : Int = { | |
val hash = KeyHasher.FNV1_32.hashKey(element.getBytes) | |
val unique = hash >> shift | |
val limited = unique & (m - 1) | |
limited.asInstanceOf[Int] | |
} | |
def store(indexes: Seq[Int]) = { | |
for(index <- indexes) { | |
storage(index) = 1 | |
} | |
} | |
def maybeExists[T <: ByteProvider](element: T): Boolean = { | |
val indexes = hash(element) | |
val maybeStoredElement = for(index <- indexes) yield storage(index) | |
!maybeStoredElement.contains(0) | |
} | |
def fileToTokens(f: String) = scala.io.Source.fromFile(f).mkString.split("""\s+""") | |
//store file 1 stuff | |
for(file1Token <- fileToTokens("/tmp/file1.txt")) { | |
store(hash(file1Token)) | |
} | |
//compare with file2 stuff | |
for(file2Token <- fileToTokens("/tmp/file2.txt")) { | |
println(s"Token '$file2Token' exists = ${maybeExists(file2Token)}") | |
} | |
// | |
// val helloIndexes = hash("hello world") | |
// store(helloIndexes) | |
// | |
// val goodbyeIndexes = hash("goodbye world") | |
// store(goodbyeIndexes) | |
// | |
// //does it exist? | |
// println("1=" + maybeExists("hello world")) | |
// println("2=" + maybeExists("goodbye world")) | |
// println("3=" + maybeExists("hello")) | |
// println("4=" + maybeExists("world")) | |
// println("5=" + maybeExists("goodbye world!")) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment