Created
June 10, 2024 07:01
-
-
Save sergkh/59ca75698020d612a5a36300f83eb614 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//> using dep "io.github.fastfilter:fastfilter:1.0.2" | |
//> using dep "com.duprasville.guava:guava-probably:1.0" | |
// requires manual maven installation of xor-filter library: | |
//> using jar ~/.m2/repository/me/k11i/xor-filter/0.1.2-SNAPSHOT/xor-filter-0.1.2-SNAPSHOT.jar | |
//> using sourceJar ~/.m2/repository/me/k11i/xor-filter/0.1.2-SNAPSHOT/xor-filter-0.1.2-SNAPSHOT-sources.jar | |
import org.fastfilter.bloom.Bloom | |
import org.fastfilter.bloom.count.SuccinctCountingBloom | |
import scala.collection.mutable.Buffer | |
import org.fastfilter.cuckoo._ | |
import org.fastfilter.xor._ | |
import org.fastfilter.xorplus._ | |
import scala.concurrent.duration.FiniteDuration | |
import com.duprasville.guava.probably.ProbabilisticFilter | |
import org.fastfilter.bloom.count.CountingBloom | |
import me.k11i.xorfilter.XorFilter | |
import java.util.UUID | |
import com.duprasville.guava.probably.CuckooFilter | |
import com.google.common.hash.Funnels | |
import java.io.ByteArrayOutputStream | |
import com.google.common.hash.BloomFilter | |
import scala.collection.mutable.HashSet | |
import java.io.ByteArrayInputStream | |
import org.fastfilter._ | |
import scala.compiletime.uninitialized | |
import scala.concurrent.duration._ | |
val maxSize = 5000L | |
val elementsToInsert = 5000 | |
val tries = 100000 | |
println("Capacity of the filter: " + maxSize) | |
println("Elements: " + elementsToInsert) | |
trait FilterOps { | |
def fill(x: Seq[String]): Unit | |
def contains(x: String): Boolean | |
def remove(x: String): Unit | |
def size: Int | |
} | |
class SetFilterOps extends FilterOps { | |
private var filter: HashSet[String] = new HashSet[String]() | |
def fill(s: Seq[String]): Unit = s.foreach(filter.add) | |
def contains(s: String): Boolean = filter.contains(s) | |
def remove(s: String): Unit = filter.remove(s) | |
def size: Int = filter.size * filter.headOption.map(_.length).getOrElse(1) | |
} | |
def fastFilterOps(t: FilterType, bits: Int = 8): FilterOps = new FilterOps { | |
private var filter: Filter = uninitialized | |
def fill(s: Seq[String]): Unit = { filter = t.construct(s.map(hash).toArray, bits) } | |
def contains(s: String): Boolean = filter.mayContain(hash(s)) | |
def remove(s: String): Unit = filter.remove(hash(s)) | |
def size: Int = ((filter match { | |
case _: Bloom => filter.asInstanceOf[Bloom].getBitCount() | |
case _: CountingBloom => filter.asInstanceOf[CountingBloom].getBitCount() | |
case _: XorPlus8 => filter.asInstanceOf[XorPlus8].getBitCount() | |
case _: XorFuse8 => filter.asInstanceOf[XorFuse8].getBitCount() | |
case _: Xor8 => filter.asInstanceOf[Xor8].getBitCount() | |
case _: Xor16 => filter.asInstanceOf[Xor16].getBitCount() | |
case _: XorSimple => filter.asInstanceOf[XorSimple].getBitCount() | |
case _: Cuckoo8 => filter.asInstanceOf[Cuckoo8].getBitCount() | |
case _: Cuckoo16 => filter.asInstanceOf[Cuckoo16].getBitCount() | |
case _: CuckooPlus8 => filter.asInstanceOf[CuckooPlus8].getBitCount() | |
case _: CuckooPlus16 => filter.asInstanceOf[CuckooPlus16].getBitCount() | |
case _: SuccinctCountingBloom => filter.asInstanceOf[SuccinctCountingBloom].getBitCount() | |
case x => println(s"Not defined for filter: $x"); 0 | |
}) / 8).toInt | |
private def hash(s: String): Long = { | |
val seed = 0x9747b28c | |
val len = s.length() | |
var hash = 0xD45E69F901E72147L ^ seed | |
for(i <- 0 until len) | |
hash = 0x3631754B22FF2D5CL * (i + s.charAt(i)) ^ (hash << 2) ^ (hash >>> 2) | |
hash | |
} | |
} | |
class ProbabilisticFilterFilterOps(filter: ProbabilisticFilter[String]) extends FilterOps { | |
def fill(s: Seq[String]): Unit = s.foreach(filter.add) | |
def contains(s: String): Boolean = filter.contains(s) | |
def remove(s: String): Unit = filter.remove(s) | |
def size: Int = { | |
val out = new ByteArrayOutputStream(128) | |
filter match { | |
case _: CuckooFilter[_] => filter.asInstanceOf[CuckooFilter[?]].writeTo(out) | |
case _ => () | |
} | |
out.size() | |
} | |
} | |
class BloomFilterFilterOps extends FilterOps { | |
private var f: BloomFilter[String] = uninitialized | |
def fill(s: Seq[String]): Unit = { | |
f = BloomFilter.create(Funnels.unencodedCharsFunnel(), s.size) | |
s.foreach(f.put) | |
} | |
def contains(s: String): Boolean = f.mightContain(s) | |
def remove(s: String): Unit = () | |
def size: Int = { | |
val out = new ByteArrayOutputStream(128) | |
f match { | |
case _: BloomFilter[_] => f.asInstanceOf[BloomFilter[?]].writeTo(out) | |
} | |
out.size() | |
} | |
} | |
def tokenId = UUID.randomUUID().toString.replaceAll("-", "") | |
val tokensSet = (0 until elementsToInsert).map(_ => tokenId) | |
val toRemoveSet = tokensSet.grouped(2).map(_.head) | |
def runTests(filter: FilterOps, filterName: String): Unit = { | |
val removeSet = new HashSet[String]() | |
println(s"\n\n===\nRunning tests for $filterName") | |
filter.fill(tokensSet) | |
println(s"Size in bytes: ${filter.size}") | |
var falsePositives = 0 | |
for (_ <- 0 until tries) { | |
val id = tokenId | |
if (filter.contains(id)) { | |
falsePositives += 1 | |
} | |
} | |
println(s"False positives: $falsePositives (${(falsePositives.toDouble / tries) * 100}%)") | |
} | |
case class SimulationResult( | |
filterName: String, | |
falsePositives: Int, | |
falsePositivesRate: Double, | |
falsePositivesRequests: Int, | |
falsePositivesTraffic: Int, | |
traffic: Int, | |
memoryFootprint: Int, | |
timeToFill: Long, | |
timeToSearch: Long | |
) | |
def runAuthSimulation( | |
filter: FilterOps, | |
filterName: String, | |
users: Int = 10000, | |
sessionLength: FiniteDuration = 1.hour, | |
blocksPercentage: Double = 0.35, | |
requestsPerSession: Int = 100, | |
updateTime: FiniteDuration = 5.minutes, | |
noPrint: Boolean = false | |
): SimulationResult = { | |
if(!noPrint) println(s"\n\n===\nRunning auth simulation for $filterName") | |
val authTransfersPerSession = sessionLength / updateTime | |
val totalBlockedUsers = (users * blocksPercentage).toInt | |
val totalTokens = (0 until users).map(_ => tokenId) | |
val (validTokens, blockedTokens) = totalTokens.splitAt(totalBlockedUsers) | |
val start = System.nanoTime() | |
filter.fill(blockedTokens) | |
val end = System.nanoTime() | |
val memoryFootprint = filter.size | |
val traffic = ((sessionLength / updateTime) * memoryFootprint).toInt | |
val startSearch = System.nanoTime() | |
val falsePositives = validTokens.count(token => filter.contains(token)) | |
val endSearch = System.nanoTime() | |
val falsePositivesRate = (falsePositives.toDouble / totalBlockedUsers) * 100 | |
val falsePositivesRequests = falsePositives * requestsPerSession | |
val falsePositivesTraffic = falsePositivesRequests * 20 // taking 20 bytes on average for a forbidden request | |
if (!noPrint) { | |
println(s"""Report for $filterName: | |
| Total/blocked users: $users / $totalBlockedUsers | |
| False positives: $falsePositives ($falsePositivesRate%) | |
| False positives requests: $falsePositivesRequests | |
| False positives traffic: $falsePositivesTraffic bytes | |
| Auth traffic: $traffic bytes (${(sessionLength / updateTime)} requests) | |
| Memory footprint: $memoryFootprint bytes | |
| Time to fill: ${end - start} ms | |
| Time to search: ${endSearch - startSearch} ms | |
|""".stripMargin) | |
} | |
SimulationResult( | |
filterName, | |
falsePositives, | |
falsePositivesRate, | |
falsePositivesRequests, | |
falsePositivesTraffic, | |
traffic, | |
memoryFootprint, | |
end - start, | |
endSearch - startSearch | |
) | |
} | |
def runAllSimulations( | |
users: Int = 10000, | |
sessionLength: FiniteDuration = 1.day, | |
blocksPercentage: Double = 0.25, | |
requestsPerSession: Int = 100, | |
updateTime: FiniteDuration = 5.minutes | |
): Unit = { | |
var results = Buffer.empty[SimulationResult] | |
results += runAuthSimulation(new SetFilterOps, "Set", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
// results += runAuthSimulation(new BloomFilterFilterOps, "Bloom filter", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
// results += runAuthSimulation(new ProbabilisticFilterFilterOps(CuckooFilter.create(Funnels.unencodedCharsFunnel(), maxSize)), "Cuckoo", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.BLOOM), "Bloom 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.BLOOM, 16), "Bloom 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.COUNTING_BLOOM), "Counting bloom", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_8), "Cuckoo 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_16), "Cuckoo 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_PLUS_8), "Cuckoo plus 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.CUCKOO_PLUS_16), "Cuckoo plus 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.XOR_8), "XOR 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.XOR_16), "XOR 16", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
results += runAuthSimulation(fastFilterOps(FilterType.XOR_PLUS_8), "XOR plus 8", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
// results += runAuthSimulation(fastFilterOps(FilterType.XOR_SIMPLE), "XOR simple fast", users, sessionLength, blocksPercentage, requestsPerSession, updateTime, true) | |
println(s"\n\n===\nRunning all simulations for users: $users, session length: $sessionLength, blocks percentage: $blocksPercentage, requests per session: $requestsPerSession, update time: $updateTime") | |
println("Name \tFP\tFP Rate\tFP Requests\tFPTraffic\tTraffic Total\tMemory footprint\tTime to fill\tTime to query") | |
results.foreach(r => | |
println(f""""${r.filterName}%30s"\t${r.falsePositives}\t${r.falsePositivesRate}%.2f%%\t ${r.falsePositivesRequests.toString}%6s\t${r.falsePositivesTraffic.toString}%10s\t${r.traffic.toString}%10s\t${r.memoryFootprint.toString}%16s\t${r.timeToFill.toString}%12s\t${r.timeToSearch.toString}%12s""") | |
) | |
val smallestFP = results.filterNot(_.filterName == "Set").minBy(_.falsePositives) | |
println(s"Smallest FP: ${smallestFP.filterName} (${smallestFP.falsePositives} bytes)") | |
val smallestMemory = results.minBy(_.memoryFootprint) | |
println(s"Smallest memory footprint: ${smallestMemory.filterName} (${smallestMemory.memoryFootprint} bytes)") | |
} | |
runAllSimulations(100) | |
runAllSimulations(100, blocksPercentage = 0.05) | |
runAllSimulations(100, blocksPercentage = 0.10) | |
// runAllSimulations(100, 6.hours) | |
// runAllSimulations(100, 2.days) | |
runAllSimulations(1000) | |
runAllSimulations(1000, blocksPercentage = 0.05) | |
runAllSimulations(1000, blocksPercentage = 0.10) | |
// runAllSimulations(1000, 6.hours) | |
// runAllSimulations(1000, 2.days) | |
runAllSimulations(10000) | |
runAllSimulations(10000, blocksPercentage = 0.05) | |
runAllSimulations(10000, blocksPercentage = 0.10) | |
runAllSimulations(10000, 6.hours) | |
runAllSimulations(10000, 2.days) | |
runAllSimulations(100000) | |
runAllSimulations(100000, blocksPercentage = 0.05) | |
runAllSimulations(100000, blocksPercentage = 0.10) | |
runAllSimulations(100000, 6.hours) | |
runAllSimulations(100000, 2.days) | |
runAllSimulations(1000000) | |
runAllSimulations(1000000, blocksPercentage = 0.05) | |
runAllSimulations(1000000, blocksPercentage = 0.10) | |
runAllSimulations(1000000, 6.hours) | |
runAllSimulations(1000000, 2.days) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment