Created
February 2, 2009 05:04
-
-
Save snej/56785 to your computer and use it in GitHub Desktop.
Implements a Bloom filter in the Scala language.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.BitSet | |
import scala.collection.mutable | |
import scala.runtime.RichString | |
/** Abstract trait for a Bloom filter, a data structure similar to a set, but which requires | |
far less memory (only a few bits per item). The trade-off is that you get false positives: | |
the filter may think it contains items that were never added to it. But for some | |
applications (especially distributed ones) the memory savings are worth it. | |
See: <http://en.wikipedia.org/wiki/Bloom_filter> */ | |
@cloneable | |
trait AbsBloomFilter[T] { | |
/** @return True if the object could have been added to the filter, else false */ | |
def couldContain (elem: T) :Boolean = hashes.forall( hash => _bits(hash(elem).abs % size) ) | |
def apply (elem :T) = couldContain(elem) | |
/** The underlying BitSet storing the data. This can be stored or transmitted. */ | |
val bits :BitSet = _bits | |
override def toString = this.getClass().getName() + "[" + | |
(0 until size).map( i => if (_bits(i)) '1' else '0' ).mkString + "]" | |
protected type BitsType <: BitSet | |
protected var _bits :BitsType | |
protected val hashes :List[T => Int] | |
protected val size = _bits.capacity | |
} | |
/** A concrete immutable Bloom filter. | |
@param hashes A list of hash functions that are used to set and test the bits. | |
@param b The bits storing the filter data. */ | |
class BloomFilter[T] (val hashes :List[T => Int], val b :BitSet) | |
extends {var _bits = b} | |
with AbsBloomFilter[T] { | |
override def clone() = this | |
protected type BitsType = BitSet | |
} | |
/** A concrete mutable Bloom filter. | |
@param hashes A list of hash functions that are used to set and test the bits. | |
@param b The bits storing the filter data. */ | |
class MutableBloomFilter[T] (val hashes :List[T => Int], val b :mutable.BitSet) | |
extends {var _bits = b.clone()} | |
with AbsBloomFilter[T] { | |
/** Creates a new empty Bloom filter. | |
@param hashes A list of hash functions that are used to set and test the bits. | |
@param s The size in bits of the BitSet to use. Larger values take more memory | |
(of course) but produce fewer false positives. */ | |
def this(hashes :List[T => Int], s :Int) = this(hashes, new mutable.BitSet(s)) | |
/** Adds an object to the Bloom filter. */ | |
def += (elem :T) = hashes.foreach( hash => _bits += (hash(elem).abs % size) ) | |
override def clone = new MutableBloomFilter(hashes,_bits) | |
protected type BitsType = mutable.BitSet | |
} | |
/** A useful group of hash functions for use with Bloom filters. | |
Implementations are in Arash Partow's General Hash Function library | |
<http://www.partow.net/programming/hashfunctions> */ | |
object HashFunctions { | |
def RS (str :String) = GeneralHashFunctionLibrary.RSHash(str).asInstanceOf[Int] | |
def PJW (str :String) = GeneralHashFunctionLibrary.PJWHash(str).asInstanceOf[Int] | |
def JS (str :String) = GeneralHashFunctionLibrary.JSHash(str).asInstanceOf[Int] | |
def ELF (str :String) = GeneralHashFunctionLibrary.ELFHash(str).asInstanceOf[Int] | |
def BKDR(str :String) = GeneralHashFunctionLibrary.BKDRHash(str).asInstanceOf[Int] | |
def SDBM(str :String) = GeneralHashFunctionLibrary.SDBMHash(str).asInstanceOf[Int] | |
def DJB (str :String) = GeneralHashFunctionLibrary.DJBHash(str).asInstanceOf[Int] | |
def DEK (str :String) = GeneralHashFunctionLibrary.DEKHash(str).asInstanceOf[Int] | |
def BP (str :String) = GeneralHashFunctionLibrary.BPHash(str).asInstanceOf[Int] | |
def FNV (str :String) = GeneralHashFunctionLibrary.FNVHash(str).asInstanceOf[Int] | |
def AP (str :String) = GeneralHashFunctionLibrary.APHash(str).asInstanceOf[Int] | |
val all : List[String=>Int] = List(RS,PJW,JS,ELF,BKDR,SDBM,DJB,DEK,BP,FNV,AP) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment