A Kotlin script to compare the output of two Macrobenchmark runs, validating the data and computing the confidence interval for a difference between two means
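The script expects the benchmarkData JSON files that Macrobenchmark writes out, one per run. For reference, a minimal input that satisfies the parser below looks like this (field names come from the data classes in the script; the class name, metric name, and values are illustrative, and extra fields in real Macrobenchmark output are skipped by Moshi by default):

{
  "benchmarks": [
    {
      "name": "startup",
      "className": "com.example.StartupBenchmark",
      "repeatIterations": 30,
      "metrics": {
        "timeToInitialDisplayMs": {
          "runs": [251.5, 248.2, 255.0]
        }
      }
    }
  ]
}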
#!/usr/bin/env kotlin
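// Usage: pass the paths of the two benchmarkData JSON files to compare
// (illustrative script name and paths):
//   ./compare_benchmarks.main.kts baseline/benchmarkData.json candidate/benchmarkData.json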
@file:Repository("https://repo.maven.apache.org/maven2/")
@file:DependsOn("com.datumbox:datumbox-framework-lib:0.8.2")
@file:DependsOn("com.squareup.okio:okio:3.3.0")
@file:DependsOn("com.squareup.moshi:moshi:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-adapters:1.13.0")
@file:DependsOn("com.squareup.moshi:moshi-kotlin:1.13.0")

import com.datumbox.framework.common.dataobjects.FlatDataCollection
import com.datumbox.framework.core.statistics.nonparametrics.onesample.ShapiroWilk
import com.squareup.moshi.Moshi
import com.squareup.moshi.kotlin.reflect.KotlinJsonAdapterFactory
import okio.FileSystem
import okio.Path
import okio.Path.Companion.toPath
import okio.buffer
import java.text.DecimalFormat
import kotlin.math.pow
import kotlin.math.roundToInt
import kotlin.math.sqrt

typealias DoubleRange = ClosedFloatingPointRange<Double>
check(args.size == 2) {
  "Expected exactly two arguments: the paths of the two Macrobenchmark JSON files to compare."
}
val pathToBenchmarkJsonFile1 = args[0].toPath()
val pathToBenchmarkJsonFile2 = args[1].toPath()

val analysis1 = pathToBenchmarkJsonFile1.parseMacrobenchmarkJson()
val analysis2 = pathToBenchmarkJsonFile2.parseMacrobenchmarkJson()
val comparison = compare(analysis1, analysis2)
// zScore for confidence level 95%.
val zScore = 1.96
val twoDecimals = DecimalFormat("#.##")

for ((testName, metrics) in comparison.metricComparisonsByTest.entries) {
  println("###########################################################################")
  println("Results for $testName")
  for ((metricName, metricComparison) in metrics.entries) {
    println("##################################################")
    println(metricName)
    val confidenceInterval = metricComparison.computeConfidenceInterval(zScore)
    val meanDifferenceRange = confidenceInterval.meanDifferenceRange
    val meanDifferencePercentRange = confidenceInterval.meanDifferencePercentRange
    println("#########################")
    println("DATA CHECKS")
    if (metricComparison.allChecksPass) {
      println("✓ All checks passed, the comparison conclusion is meaningful.\n")
    } else {
      println("˟ Some checks did not pass, the comparison conclusion is NOT meaningful.\n")
    }
    println(
      """
      Data checks for Benchmark 1
      - ${metricComparison.metric1.checkEnoughIterations.check()} At least 30 iterations (${metricComparison.metric1.sampleSize})
      - ${metricComparison.metric1.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(metricComparison.metric1.coefficientOfVariation * 100)}%) <= 6%
      - ${metricComparison.metric1.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      Data checks for Benchmark 2
      - ${metricComparison.metric2.checkEnoughIterations.check()} At least 30 iterations (${metricComparison.metric2.sampleSize})
      - ${metricComparison.metric2.checkCoefficientOfVariationLowEnough.check()} CV (${twoDecimals.format(metricComparison.metric2.coefficientOfVariation * 100)}%) <= 6%
      - ${metricComparison.metric2.checkLatenciesPassNormalityTest.check()} Latencies pass normality test
      - ${metricComparison.checkVarianceLessThanDouble.check()} Variance ratio within [0.5, 2] (${twoDecimals.format(metricComparison.varianceRatio)})
      #########################
      """.trimIndent()
    )
    println("RESULT")
    println("Mean difference confidence interval at 95% confidence level:")
    when {
      0.0 in meanDifferenceRange -> {
        println(
          "The change yielded no statistically significant difference (the mean difference confidence interval crosses 0): "
            + "from ${meanDifferenceRange.start.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.start * 100)}%) "
            + "to ${meanDifferenceRange.endInclusive.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.endInclusive * 100)}%)."
        )
      }
      meanDifferenceRange.endInclusive < 0.0 -> {
        println(
          "The change yielded a mean improvement of "
            + "${meanDifferenceRange.endInclusive.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.endInclusive * 100)}%) "
            + "to ${meanDifferenceRange.start.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.start * 100)}%)."
        )
      }
      else -> {
        println(
          "The change yielded a mean regression of "
            + "${meanDifferenceRange.start.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.start * 100)}%) "
            + "to ${meanDifferenceRange.endInclusive.roundToInt()} ms (${twoDecimals.format(meanDifferencePercentRange.endInclusive * 100)}%)."
        )
      }
    }
    println("#########################")
    println("MEDIANS")
    println("The median went from ${metricComparison.metric1.median.roundToInt()} ms to ${metricComparison.metric2.median.roundToInt()} ms.")
    println("DO NOT REPORT THE DIFFERENCE IN MEDIANS.")
    println("This data helps contextualize results but is not statistically meaningful.")
    println("#########################")
  }
}
fun Boolean.check() = if (this) "✓" else "˟"

fun Path.parseMacrobenchmarkJson(): BenchmarksData {
  val jsonSource = FileSystem.SYSTEM
    .source(this)
    .buffer()
    .readUtf8()
  val jsonParser: Moshi = Moshi.Builder()
    .add(KotlinJsonAdapterFactory())
    .build()
  return checkNotNull(
    jsonParser
      .adapter(BenchmarksData::class.java)
      .fromJson(jsonSource)
  ) { "Could not parse $this as Macrobenchmark JSON." }
}
data class BenchmarksData(
  val benchmarks: List<Benchmark>
)

data class Benchmark(
  val name: String,
  val className: String,
  val metrics: Map<String, Metric>,
  val repeatIterations: Int
) {
  val testName by lazy {
    "${className}#${name}"
  }
}
data class Metric(
  val runs: List<Double>
) {
  val mean by lazy {
    runs.average()
  }
  val median by lazy {
    runs.p(50)
  }
  val variance by lazy {
    runs.variance()
  }
  val sampleSize: Int
    get() = runs.size
  val standardDeviation by lazy {
    sqrt(variance)
  }
  val coefficientOfVariation by lazy {
    standardDeviation / mean
  }
  val checkEnoughIterations by lazy {
    sampleSize >= 30
  }
  val checkLatenciesPassNormalityTest by lazy {
    // Null hypothesis: the distribution is normal.
    // Alpha level (5%): probability of wrongly rejecting the null hypothesis.
    val alphaLevel = 0.05
    val rejectNullHypothesis = ShapiroWilk.test(FlatDataCollection(runs), alphaLevel)
    !rejectNullHypothesis
  }
  val checkCoefficientOfVariationLowEnough by lazy {
    coefficientOfVariation <= 0.06
  }
  private fun List<Double>.variance(): Double {
    // Sample variance (divides by n - 1, Bessel's correction), consistent
    // with the (n - 1) weights in the pooled standard deviation below.
    var sum = 0.0
    val mean = average()
    forEach { value ->
      sum += (value - mean).pow(2)
    }
    return sum / (size - 1)
  }
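  /**
   * Linearly interpolated percentile over the sorted runs: p(50) is the
   * median. With an even sample size, the two middle values are averaged.
   */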
  private fun List<Double>.p(percentile: Int): Double {
    // Percentiles are only meaningful on sorted data; runs come in iteration order.
    val sorted = sorted()
    val idealIndex = percentile.coerceIn(0, 100) / 100.0 * (size - 1)
    val firstIndex = idealIndex.toInt()
    val secondIndex = firstIndex + 1
    val firstValue = sorted[firstIndex]
    val secondValue = sorted.getOrElse(secondIndex) { firstValue }
    return lerp(firstValue, secondValue, idealIndex - firstIndex)
  }
  /** lerp is a classic function name for linear interpolation */
  private fun lerp(
    a: Double,
    b: Double,
    ratio: Double
  ): Double {
    return a * (1 - ratio) + b * ratio
  }
}
data class PairedBenchmarkComparison(
  val benchmarkData1: BenchmarksData,
  val benchmarkData2: BenchmarksData,
  val metricComparisonsByTest: Map<String, Map<String, MetricComparison>>
)

data class MetricComparison(
  val metric1: Metric,
  val metric2: Metric
) {
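  // Pooling the standard deviations assumes roughly equal variances; as a
  // rule of thumb, the larger variance should be at most double the smaller
  // one, i.e. their ratio should fall within [0.5, 2].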
  val varianceRatio by lazy {
    metric2.variance / metric1.variance
  }
  val checkVarianceLessThanDouble by lazy {
    varianceRatio in 0.5..2.0
  }
  val allChecksPass by lazy {
    metric1.checkEnoughIterations &&
      metric1.checkCoefficientOfVariationLowEnough &&
      metric1.checkLatenciesPassNormalityTest &&
      metric2.checkEnoughIterations &&
      metric2.checkCoefficientOfVariationLowEnough &&
      metric2.checkLatenciesPassNormalityTest &&
      checkVarianceLessThanDouble
  }
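  // Pooled estimate of the common standard deviation of the two samples:
  //   sp = sqrt(((n1 - 1) * s1² + (n2 - 1) * s2²) / (n1 + n2 - 2))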
  val pooledEstimateOfStandardDeviation by lazy {
    val sizeMinusOne1 = metric1.sampleSize - 1
    val sizeMinusOne2 = metric2.sampleSize - 1
    sqrt(
      (sizeMinusOne1 * metric1.variance + sizeMinusOne2 * metric2.variance) /
        (sizeMinusOne1 + sizeMinusOne2)
    )
  }
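  // Standard error of the difference between the two sample means:
  //   SE = sp * sqrt(1/n1 + 1/n2)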
  val standardError by lazy {
    pooledEstimateOfStandardDeviation * sqrt((1.0 / metric1.sampleSize) + (1.0 / metric2.sampleSize))
  }

  fun computeConfidenceInterval(zScore: Double) = ConfidenceInterval(zScore, this)
}
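/**
 * Confidence interval for the difference between the two means:
 *   (mean2 - mean1) ± zScore * SE
 * A negative mean difference means benchmark 2 ran faster than benchmark 1.
 */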
class ConfidenceInterval(
  val zScore: Double,
  val metrics: MetricComparison
) {
  val errorMargin by lazy {
    zScore * metrics.standardError
  }
  val range by lazy {
    errorMargin * 2
  }
  val meanDifference by lazy {
    metrics.metric2.mean - metrics.metric1.mean
  }
  val meanDifferenceRange: DoubleRange by lazy {
    (meanDifference - errorMargin).rangeTo(meanDifference + errorMargin)
  }
  val meanDifferencePercentRange: DoubleRange by lazy {
    (meanDifferenceRange.start / metrics.metric1.mean).rangeTo(meanDifferenceRange.endInclusive / metrics.metric1.mean)
  }
}
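/**
 * Pairs up benchmarks across the two runs by test name, and metrics within
 * each test by metric name, failing fast if the two files don't cover the
 * exact same tests and metrics.
 */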
fun compare(
  benchmarkData1: BenchmarksData,
  benchmarkData2: BenchmarksData
): PairedBenchmarkComparison {
  val tests1 = benchmarkData1.benchmarks.associateBy { it.testName }
  val tests2 = benchmarkData2.benchmarks.associateBy { it.testName }
  check(tests1.keys == tests2.keys) {
    "Expected exact same set of tests between ${tests1.keys} and ${tests2.keys}"
  }
  val testsWithPairedData = tests1.mapValues { (testName, benchmark1) ->
    val benchmark2 = tests2.getValue(testName)
    check(benchmark1.metrics.keys == benchmark2.metrics.keys) {
      "Expected exact same set of metrics for $testName between ${benchmark1.metrics.keys} and ${benchmark2.metrics.keys}"
    }
    benchmark1.metrics.mapValues { (metricName, metric1) ->
      val metric2 = benchmark2.metrics.getValue(metricName)
      MetricComparison(metric1, metric2)
    }
  }
  return PairedBenchmarkComparison(benchmarkData1, benchmarkData2, testsWithPairedData)
}