Skip to content

Instantly share code, notes, and snippets.

@awreece
Created November 17, 2015 03:57
Show Gist options
  • Save awreece/68e137c4d0fe570a2609 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import blessed
from collections import defaultdict
import csv
import jinja2
import logging
import numpy
import os
import re
from scipy import stats
import subprocess
import sys
from tempfile import NamedTemporaryFile
# Ensure that we can output color escape characters and utf-8.
# NOTE(review): reload(sys) + sys.setdefaultencoding is a Python-2-only hack
# (site.py deletes setdefaultencoding, hence the reload to restore it); this
# fails under Python 3 — confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
# Shared terminal handle; used below to colorize the logged histograms.
term = blessed.Terminal()
def GetMeanStr(values, confidence):
    """Returns a string representing a confidence interval around the mean.

    Arguments:
        values: Sequence of timings in microseconds (displayed as ms).
        confidence: Two-sided confidence level, e.g. 0.999.
    """
    #
    # Use the two-sided critical value: a `confidence`-level interval leaves
    # (1 - confidence) / 2 probability in each tail, so the required t
    # quantile is (1 + confidence) / 2, not `confidence` itself (which
    # yielded a too-narrow, one-sided margin).
    #
    error = stats.sem(values) * stats.t.ppf((1 + confidence) / 2.0,
                                            len(values) - 1)
    return "%.2f±%.2fms" % (numpy.mean(values) / 1000, error / 1000)
def GetBucketChar(count, maxCount):
    """Returns a unicode bar whose height is proportional to count/maxCount."""
    bars = (' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█')
    index = int(float(count) / float(maxCount) * (len(bars) - 1))
    #
    # Deliberately show outliers, even if they would not have otherwise
    # appeared: any nonzero count renders at least the smallest bar.
    #
    if count > 0:
        index = max(index, 1)
    return bars[index]
def GetHistogramString(arr, numbins=10, defaultlimits=None):
    """Returns a one-line unicode histogram of arr, labelled in ms.

    Arguments:
        arr: Sequence of timings in microseconds (labels divide by 1000).
        numbins: Number of histogram buckets.
        defaultlimits: Optional (low, high) bucket range; defaults to the
            data's own range (numpy.histogram behavior).
    """
    #
    # scipy.stats.histogram was deprecated in scipy 0.17 and removed in 1.0;
    # numpy.histogram produces the same counts for an explicit range. The
    # keyword names (numbins, defaultlimits) are kept for the callers.
    #
    buckets, edges = numpy.histogram(arr, bins=numbins, range=defaultlimits)
    peak = max(buckets)  # Hoisted: the scale is invariant across buckets.
    hist = "%7.2fms : " % (edges[0] / 1000)
    for count in buckets:
        hist += GetBucketChar(count, peak)
    #
    # edges[-1] is low + binsize * numbins, the true upper limit; the old
    # label added one extra binsize (len(buckets) + 1), overstating it.
    #
    hist += " : %7.2fms" % (edges[-1] / 1000)
    return hist
def LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """
    Logs detailed information for the compilation and executions.

    Summary:
        Calculates detailed statistics (including histograms) for each of
        the input arrays and emits them to `logging.info`. Deliberately
        attempts to align the histograms and ensure the both execution
        histograms use the same scale to improve readability.

    Arguments:
        args: The command line arguments containing the histogram parameters
            and the confidence level.
        oldVariant: Display name of the old (baseline) test variant.
        newVariant: Display name of the new test variant.
        newExecutions: An array of floats containing new execution times in
            microseconds (display labels divide by 1000 to show ms).
        oldExecutions: An array of floats containing old execution times in
            microseconds.
    """
    #
    # We take special care to ensure that the histograms will line up (same
    # size bucket at same point on the screen): both use identical limits,
    # padded by half a bucket so extremes do not sit exactly on a boundary.
    #
    minExecution = min(min(newExecutions), min(oldExecutions))
    maxExecution = max(max(newExecutions), max(oldExecutions))
    # 0.5, not (1/2): under Python 2 integer division, (1/2) evaluates to 0,
    # which silently removed the padding entirely.
    s = 0.5 * (maxExecution - minExecution) / (args.histogram_buckets - 1)
    newExecutionHist = GetHistogramString(
        newExecutions,
        defaultlimits=(minExecution - s, maxExecution + s),
        numbins=args.histogram_buckets)
    oldExecutionHist = GetHistogramString(
        oldExecutions,
        defaultlimits=(minExecution - s, maxExecution + s),
        numbins=args.histogram_buckets)
    newExecutionMeanStr = "μ=" + GetMeanStr(newExecutions, args.confidence)
    oldExecutionMeanStr = "μ=" + GetMeanStr(oldExecutions, args.confidence)
    # Pad the name/mean columns so both rows align vertically.
    maxMeanLen = max(len(oldExecutionMeanStr), len(newExecutionMeanStr))
    maxVariantLen = max(len(oldVariant), len(newVariant))
    logging.info(term.blue("%-*s : %-*s : %s"),
                 maxVariantLen, oldVariant, maxMeanLen, oldExecutionMeanStr,
                 oldExecutionHist)
    logging.info(term.cyan("%-*s : %-*s : %s"),
                 maxVariantLen, newVariant, maxMeanLen, newExecutionMeanStr,
                 newExecutionHist)
def DoABTest(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """Compares old vs new execution times; returns False on a significant
    regression beyond args.max_regression, True otherwise."""
    logging.debug("AB Testing %s (old) vs %s (new)", oldVariant, newVariant)
    # Scale the baseline up by the allowed slack so the t test directly asks
    # "is new slower than old-plus-tolerance?".
    slack = 1 + args.max_regression
    adjustedOldExecutions = [slack * v for v in oldExecutions]
    LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions)
    #
    # Run a Welch two sample t test to ensure that we have not regressed
    # execution perf.
    #
    # While this test assumes normality, the Welch's variant does *not*
    # assume homoscedasticity (i.e. both populations have the same
    # variance). Other similar tests, such as the Mann-Whitney U test,
    # are sensitive to this property:
    #
    #   If the distributions are heteroscedastic, the Kruskal-Wallis test
    #   won't help you; instead, you should use Welch's t-test for two
    #   groups, or Welch's anova for more than two groups.
    #
    # http://www.biostathandbook.com/kruskalwallis.html
    #
    (_, p) = stats.ttest_ind(newExecutions, adjustedOldExecutions,
                             equal_var=False)
    if p >= 1 - args.confidence:
        # Not significant at the requested confidence: no conclusion.
        logging.debug("Execution had too much variance to make conclusion")
        # TODO Test 99th percentile
        return True
    newExecutionsMean = numpy.mean(newExecutions)
    if newExecutionsMean <= numpy.mean(adjustedOldExecutions):
        # Significant, but in the non-regressing direction.
        return True
    oldExecutionsMean = numpy.mean(oldExecutions)
    regressionPct = ((newExecutionsMean - oldExecutionsMean)
                     / oldExecutionsMean) * 100
    logging.error(
        term.red("Execution regressed by %.1f%% (vs %.1f%%)"),
        regressionPct, args.max_regression*100)
    return False
def main():
    """Parses flags, reads (name, micros) CSV rows from stdin, and AB tests
    the first-seen (or --old-variant) variant against the other one."""
    parser = argparse.ArgumentParser(
        description="AB variant execution performance (old vs new).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # default=0: a count action defaults to None, and `None >= 1` below
    # raises TypeError under Python 3 (it only accidentally worked on 2).
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="Increase output verbosity")
    parser.add_argument("--confidence", type=float, default=0.999,
                        help="Confidence interval (e.g. be 99.9%% confident " +
                        "of all reported values).")
    parser.add_argument("--max-regression", type=float, default=0.02,
                        help="Maximum allowed execution regression (e.g. " +
                        "new execution must be within 2%% of old execution).")
    parser.add_argument("--old-variant", help="Test type to use as old test " +
                        "variant. By default, the first test type " +
                        "encountered is the old variant.")
    parser.add_argument("--histogram-buckets", type=int, default=15,
                        help="Number of histogram buckets to use.")
    args = parser.parse_args()
    if args.verbosity >= 1:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    else:
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    executions = defaultdict(list)
    oldVariant = args.old_variant
    # Each stdin row is "<variant name>,<execution time in microseconds>".
    for name, executionMicros in csv.reader(sys.stdin):
        oldVariant = oldVariant if oldVariant else name
        executions[name].append(float(executionMicros))
    # Explicit check rather than `assert`: asserts are stripped under -O,
    # and malformed input is expected, not a programming error.
    if len(executions) != 2:
        sys.exit("Can only do AB test of two test variants.")
    oldExecutions, newExecutions = None, None
    # .items() rather than Python-2-only .iteritems(): same behavior here.
    for name, values in executions.items():
        if name == oldVariant:
            oldExecutions = values
        else:
            newExecutions = values
            newVariant = name
    DoABTest(args, oldVariant, oldExecutions, newVariant, newExecutions)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment