Skip to content

Instantly share code, notes, and snippets.

@adamretter
Created May 15, 2014 19:29
Show Gist options
  • Save adamretter/dba53d7f3c881a3b4f3a to your computer and use it in GitHub Desktop.
Save adamretter/dba53d7f3c881a3b4f3a to your computer and use it in GitHub Desktop.
package wlhn
import com.twitter.hashing.KeyHasher
object BloomApp extends App {
/**
* A bloom filter is used to test whether an element is a member of a set.
*
* False positive matches are possible,
* but false negatives are not;
* i.e. a query returns either "possibly in set" or "definitely not in set"
*
* Elements can be added to the set,
* but not removed (though this can be addressed with a "counting" filter).
* The more elements that are added to the set, the larger the probability of false positives.
*/
val k = 11 //optimal value of K is bits per item (16 for us) * 0.7
val m = 0x10000
val storage = new Array[Byte](m)
type ByteProvider = {
def getBytes(): Array[Byte]
}
/**
* k hashes for an element
*/
def hash[T <: ByteProvider](element: T) : Seq[Int] = {
for(i <- (1 to k)) yield hashFunction(element, i)
}
/**
* The hash function when given an element
* returns a position in an array
*/
def hashFunction[T <: ByteProvider](element: T, shift: Int) : Int = {
val hash = KeyHasher.FNV1_32.hashKey(element.getBytes)
val unique = hash >> shift
val limited = unique & (m - 1)
limited.asInstanceOf[Int]
}
def store(indexes: Seq[Int]) = {
for(index <- indexes) {
storage(index) = 1
}
}
def maybeExists[T <: ByteProvider](element: T): Boolean = {
val indexes = hash(element)
val maybeStoredElement = for(index <- indexes) yield storage(index)
!maybeStoredElement.contains(0)
}
def fileToTokens(f: String) = scala.io.Source.fromFile(f).mkString.split("""\s+""")
//store file 1 stuff
for(file1Token <- fileToTokens("/tmp/file1.txt")) {
store(hash(file1Token))
}
//compare with file2 stuff
for(file2Token <- fileToTokens("/tmp/file2.txt")) {
println(s"Token '$file2Token' exists = ${maybeExists(file2Token)}")
}
//
// val helloIndexes = hash("hello world")
// store(helloIndexes)
//
// val goodbyeIndexes = hash("goodbye world")
// store(goodbyeIndexes)
//
// //does it exist?
// println("1=" + maybeExists("hello world"))
// println("2=" + maybeExists("goodbye world"))
// println("3=" + maybeExists("hello"))
// println("4=" + maybeExists("world"))
// println("5=" + maybeExists("goodbye world!"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment