Skip to content

Instantly share code, notes, and snippets.

@sergkh
Created June 10, 2024 07:01
Show Gist options
  • Save sergkh/59ca75698020d612a5a36300f83eb614 to your computer and use it in GitHub Desktop.
Save sergkh/59ca75698020d612a5a36300f83eb614 to your computer and use it in GitHub Desktop.
//> using dep "io.github.fastfilter:fastfilter:1.0.2"
//> using dep "com.duprasville.guava:guava-probably:1.0"
// requires manual maven installation of xor-filter library:
//> using jar ~/.m2/repository/me/k11i/xor-filter/0.1.2-SNAPSHOT/xor-filter-0.1.2-SNAPSHOT.jar
//> using sourceJar ~/.m2/repository/me/k11i/xor-filter/0.1.2-SNAPSHOT/xor-filter-0.1.2-SNAPSHOT-sources.jar
import org.fastfilter.bloom.Bloom
import org.fastfilter.bloom.count.SuccinctCountingBloom
import scala.collection.mutable.Buffer
import org.fastfilter.cuckoo._
import org.fastfilter.xor._
import org.fastfilter.xorplus._
import scala.concurrent.duration.FiniteDuration
import com.duprasville.guava.probably.ProbabilisticFilter
import org.fastfilter.bloom.count.CountingBloom
import me.k11i.xorfilter.XorFilter
import java.util.UUID
import com.duprasville.guava.probably.CuckooFilter
import com.google.common.hash.Funnels
import java.io.ByteArrayOutputStream
import com.google.common.hash.BloomFilter
import scala.collection.mutable.HashSet
import java.io.ByteArrayInputStream
import org.fastfilter._
import scala.compiletime.uninitialized
import scala.concurrent.duration._
val maxSize = 5000L
val elementsToInsert = 5000
val tries = 100000
println("Capacity of the filter: " + maxSize)
println("Elements: " + elementsToInsert)
trait FilterOps {
def fill(x: Seq[String]): Unit
def contains(x: String): Boolean
def remove(x: String): Unit
def size: Int
}
class SetFilterOps extends FilterOps {
private var filter: HashSet[String] = new HashSet[String]()
def fill(s: Seq[String]): Unit = s.foreach(filter.add)
def contains(s: String): Boolean = filter.contains(s)
def remove(s: String): Unit = filter.remove(s)
def size: Int = filter.size * filter.headOption.map(_.length).getOrElse(1)
}
def fastFilterOps(t: FilterType, bits: Int = 8): FilterOps = new FilterOps {
private var filter: Filter = uninitialized
def fill(s: Seq[String]): Unit = { filter = t.construct(s.map(hash).toArray, bits) }
def contains(s: String): Boolean = filter.mayContain(hash(s))
def remove(s: String): Unit = filter.remove(hash(s))
def size: Int = ((filter match {
case _: Bloom => filter.asInstanceOf[Bloom].getBitCount()
case _: CountingBloom => filter.asInstanceOf[CountingBloom].getBitCount()
case _: XorPlus8 => filter.asInstanceOf[XorPlus8].getBitCount()
case _: XorFuse8 => filter.asInstanceOf[XorFuse8].getBitCount()
case _: Xor8 => filter.asInstanceOf[Xor8].getBitCount()
case _: Xor16 => filter.asInstanceOf[Xor16].getBitCount()
case _: XorSimple => filter.asInstanceOf[XorSimple].getBitCount()
case _: Cuckoo8 => filter.asInstanceOf[Cuckoo8].getBitCount()
case _: Cuckoo16 => filter.asInstanceOf[Cuckoo16].getBitCount()
case _: CuckooPlus8 => filter.asInstanceOf[CuckooPlus8].getBitCount()
case _: CuckooPlus16 => filter.asInstanceOf[CuckooPlus16].getBitCount()
case _: SuccinctCountingBloom => filter.asInstanceOf[SuccinctCountingBloom].getBitCount()
case x => println(s"Not defined for filter: $x"); 0
}) / 8).toInt
private def hash(s: String): Long = {
val seed = 0x9747b28c
val len = s.length()
var hash = 0xD45E69F901E72147L ^ seed
for(i <- 0 until len)
hash = 0x3631754B22FF2D5CL * (i + s.charAt(i)) ^ (hash << 2) ^ (hash >>> 2)
hash
}
}
class ProbabilisticFilterFilterOps(filter: ProbabilisticFilter[String]) extends FilterOps {
def fill(s: Seq[String]): Unit = s.foreach(filter.add)
def contains(s: String): Boolean = filter.contains(s)
def remove(s: String): Unit = filter.remove(s)
def size: Int = {
val out = new ByteArrayOutputStream(128)
filter match {
case _: CuckooFilter[_] => filter.asInstanceOf[CuckooFilter[?]].writeTo(out)
case _ => ()
}
out.size()
}
}
class BloomFilterFilterOps extends FilterOps {
private var f: BloomFilter[String] = uninitialized
def fill(s: Seq[String]): Unit = {
f = BloomFilter.create(Funnels.unencodedCharsFunnel(), s.size)
s.foreach(f.put)
}
def contains(s: String): Boolean = f.mightContain(s)
def remove(s: String): Unit = ()
def size: Int = {
val out = new ByteArrayOutputStream(128)
f match {
case _: BloomFilter[_] => f.asInstanceOf[BloomFilter[?]].writeTo(out)
}
out.size()
}
}
def tokenId = UUID.randomUUID().toString.replaceAll("-", "")
val tokensSet = (0 until elementsToInsert).map(_ => tokenId)
val toRemoveSet = tokensSet.grouped(2).map(_.head)
def runTests(filter: FilterOps, filterName: String): Unit = {
val removeSet = new HashSet[String]()
println(s"\n\n===\nRunning tests for $filterName")
filter.fill(tokensSet)
println(s"Size in bytes: ${filter.size}")
var falsePositives = 0
for (_ <- 0 until tries) {
val id = tokenId
if (filter.contains(id)) {
falsePositives += 1
}
}
println(s"False positives: $falsePositives (${(falsePositives.toDouble / tries) * 100}%)")
}
case class SimulationResult(
filterName: String,
falsePositives: Int,
falsePositivesRate: Double,
falsePositivesRequests: Int,
falsePositivesTraffic: Int,
traffic: Int,
memoryFootprint: Int,
timeToFill: Long,
timeToSearch: Long
)
def runAuthSimulation(
filter: FilterOps,
filterName: String,
users: Int = 10000,
sessionLength: FiniteDuration = 1.hour,
blocksPercentage: Double = 0.35,
requestsPerSession: Int = 100,
updateTime: FiniteDuration = 5.minutes,
noPrint: Boolean = false
): SimulationResult = {
if(!noPrint) println(s"\n\n===\nRunning auth simulation for $filterName")
val authTransfersPerSession = sessionLength / updateTime
val totalBlockedUsers = (users * blocksPercentage).toInt
val totalTokens = (0 until users).map(_ => tokenId)
val (validTokens, blockedTokens) = totalTokens.splitAt(totalBlockedUsers)
val start = System.nanoTime()
filter.fill(blockedTokens)
val end = System.nanoTime()
val memoryFootprint = filter.size
val traffic = ((sessionLength / updateTime) * memoryFootprint).toInt
val startSearch = System.nanoTime()
val falsePositives = validTokens.count(token => filter.contains(token))
val endSearch = System.nanoTime()
val falsePositivesRate = (falsePositives.toDouble / totalBlockedUsers) * 100
val falsePositivesRequests = falsePositives * requestsPerSession
val falsePositivesTraffic = falsePositivesRequests * 20 // taking 20 bytes on average for a forbidden request
if (!noPrint) {
println(s"""Report for $filterName:
| Total/blocked users: $users / $totalBlockedUsers
| False positives: $falsePositives ($falsePositivesRate%)
| False positives requests: $falsePositivesRequests
| False positives traffic: $falsePositivesTraffic bytes
| Auth traffic: $traffic bytes (${(sessionLength / updateTime)} requests)
| Memory footprint: $memoryFootprint bytes
| Time to fill: ${end - start} ms
| Time to search: ${endSearch - startSearch} ms
|""".stripMargin)
}
SimulationResult(
filterName,
falsePositives,
falsePositivesRate,
falsePositivesRequests,
falsePositivesTraffic,
traffic,
memoryFootprint,
end - start,
endSearch - startSearch
)
}
def runAllSimulations(
users: Int = 10000,
sessionLength: FiniteDuration = 1.day,
blocksPercentage: Double = 0.25,
requestsPerSession: Int = 100,
updateTime: FiniteDuration = 5.minutes
): Unit = {
var results = Buffer.empty[SimulationResult]
results += runAuthSimulation(new SetFilterOps, "Set", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
// results += runAuthSimulation(new BloomFilterFilterOps, "Bloom filter", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
// results += runAuthSimulation(new ProbabilisticFilterFilterOps(CuckooFilter.create(Funnels.unencodedCharsFunnel(), maxSize)), "Cuckoo", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.BLOOM), "Bloom 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.BLOOM, 16), "Bloom 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.COUNTING_BLOOM), "Counting bloom", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_8), "Cuckoo 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_16), "Cuckoo 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_PLUS_8), "Cuckoo plus 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_PLUS_16), "Cuckoo plus 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.XOR_8), "XOR 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.XOR_16), "XOR 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
results += runAuthSimulation(fastFilterOps(FilterType.XOR_PLUS_8), "XOR plus 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
// results += runAuthSimulation(fastFilterOps(FilterType.XOR_SIMPLE), "XOR simple fast", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true)
println(s"\n\n===\nRunning all simulations for users: $users, session length: $sessionLength, blocks percentage: $blocksPercentage, requests per session: $requestsPerSession, update time: $updateTime")
println("Name \tFP\tFP Rate\tFP Requests\tFPTraffic\tTraffic Total\tMemory footprint\tTime to fill\tTime to query")
results.foreach(r =>
println(f""""${r.filterName}%30s"\t${r.falsePositives}\t${r.falsePositivesRate}%.2f%%\t ${r.falsePositivesRequests.toString}%6s\t${r.falsePositivesTraffic.toString}%10s\t${r.traffic.toString}%10s\t${r.memoryFootprint.toString}%16s\t${r.timeToFill.toString}%12s\t${r.timeToSearch.toString}%12s""")
)
val smallestFP = results.filterNot(_.filterName == "Set").minBy(_.falsePositives)
println(s"Smallest FP: ${smallestFP.filterName} (${smallestFP.falsePositives} bytes)")
val smallestMemory = results.minBy(_.memoryFootprint)
println(s"Smallest memory footprint: ${smallestMemory.filterName} (${smallestMemory.memoryFootprint} bytes)")
}
runAllSimulations(100)
runAllSimulations(100, blocksPercentage = 0.05)
runAllSimulations(100, blocksPercentage = 0.10)
// runAllSimulations(100, 6.hours)
// runAllSimulations(100, 2.days)
runAllSimulations(1000)
runAllSimulations(1000, blocksPercentage = 0.05)
runAllSimulations(1000, blocksPercentage = 0.10)
// runAllSimulations(1000, 6.hours)
// runAllSimulations(1000, 2.days)
runAllSimulations(10000)
runAllSimulations(10000, blocksPercentage = 0.05)
runAllSimulations(10000, blocksPercentage = 0.10)
runAllSimulations(10000, 6.hours)
runAllSimulations(10000, 2.days)
runAllSimulations(100000)
runAllSimulations(100000, blocksPercentage = 0.05)
runAllSimulations(100000, blocksPercentage = 0.10)
runAllSimulations(100000, 6.hours)
runAllSimulations(100000, 2.days)
runAllSimulations(1000000)
runAllSimulations(1000000, blocksPercentage = 0.05)
runAllSimulations(1000000, blocksPercentage = 0.10)
runAllSimulations(1000000, 6.hours)
runAllSimulations(1000000, 2.days)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment