Created
November 17, 2015 03:57
-
-
Save awreece/68e137c4d0fe570a2609 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import argparse | |
import blessed | |
from collections import defaultdict | |
import csv | |
import jinja2 | |
import logging | |
import numpy | |
import os | |
import re | |
from scipy import stats | |
import subprocess | |
import sys | |
from tempfile import NamedTemporaryFile | |
# Ensure that we can output color escape characters and utf-8.
#
# The reload()/setdefaultencoding() trick only exists on Python 2. On
# Python 3 `reload` is not a builtin, so running it unconditionally would
# raise NameError at import time; skip it there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- Python 2 builtin
    sys.setdefaultencoding("utf-8")

# Shared terminal handle used to colorize the log output.
term = blessed.Terminal()
def GetMeanStr(values, confidence):
    """Returns a string representing a confidence interval around the mean.

    Arguments:
        values: A sequence of samples. Both the mean and the error are
            divided by 1000 before being labelled "ms", so the samples
            are presumably microseconds.
        confidence: The two-sided confidence level of the interval
            (e.g. 0.999).

    Returns:
        A string of the form "<mean>±<error>ms".
    """
    #
    # A symmetric mean±error interval leaves (1 - confidence) / 2 in each
    # tail, so the critical value is the (1 + confidence) / 2 quantile of
    # the t distribution. Using `confidence` directly yields a one-sided
    # bound and understates the error.
    #
    quantile = (1.0 + confidence) / 2.0
    error = stats.sem(values) * stats.t.ppf(quantile, len(values) - 1)
    return "%.2f±%.2fms" % (numpy.mean(values) / 1000, error / 1000)
def GetBucketChar(count, maxCount):
    """Returns the block character for a bucket, scaled against maxCount.

    Arguments:
        count: Number of samples in this bucket.
        maxCount: Number of samples in the fullest bucket.

    Returns:
        One of nine glyphs ranging from a blank to a full block.
    """
    glyphs = (' ', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█')
    tallest = len(glyphs) - 1
    level = int(float(count) / float(maxCount) * tallest)

    #
    # Deliberately show outliers, even if they would not have otherwise
    # appeared: a non-empty bucket never renders as a blank.
    #
    if level == 0 and count > 0:
        level = 1

    return glyphs[level]
def GetHistogramString(arr, **kwargs):
    """Renders arr as a one-line histogram labelled with its bounds.

    Arguments:
        arr: The values to bucket.
        **kwargs: Histogram parameters, named as in the (since removed)
            `scipy.stats.histogram` function:
            numbins: Number of buckets (default 10).
            defaultlimits: A (lower, upper) pair bounding the histogram.
                By default, the range of arr padded by half a bucket on
                each side.
            weights: Optional per-value weights.
            printextras: Accepted for compatibility and ignored.

    Returns:
        A string "<lower>ms : <bars> : <upper>ms". Each bar is scaled
        against the fullest bucket, and both bounds are divided by 1000
        before being printed.

    Raises:
        TypeError: If an unknown keyword argument is passed.
    """
    numbins = kwargs.pop("numbins", 10)
    limits = kwargs.pop("defaultlimits", None)
    weights = kwargs.pop("weights", None)
    kwargs.pop("printextras", None)
    if kwargs:
        raise TypeError("unexpected histogram arguments: %s"
                        % ", ".join(sorted(kwargs)))

    data = numpy.ravel(arr)
    if limits is None:
        # Same default as the old scipy helper: extend half a bucket past
        # the extreme values so they do not sit on the outer edges.
        lowest, highest = data.min(), data.max()
        pad = (highest - lowest) / (2.0 * (numbins - 1)) if numbins > 1 else 0.0
        limits = (lowest - pad, highest + pad)

    #
    # `scipy.stats.histogram` is gone from current SciPy; it was a thin
    # wrapper around `numpy.histogram`, which also hands back the exact
    # bin edges. Labelling with the last edge avoids the previous
    # off-by-one (low + binsize * (numbins + 1)) in the upper bound.
    #
    counts, edges = numpy.histogram(data, bins=numbins, range=limits,
                                    weights=weights)

    # Scale against the fullest bucket; fall back to 1 when every value
    # lies outside the limits so that the scaling never divides by zero.
    peak = counts.max() or 1
    bars = "".join(GetBucketChar(count, peak) for count in counts)

    return "%7.2fms : %s : %7.2fms" % (edges[0] / 1000, bars, edges[-1] / 1000)
def LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """
    Logs detailed information for the executions of both variants.

    Summary:
        Calculates detailed statistics (mean with confidence interval and
        a histogram) for each of the input arrays and emits them to
        `logging.info`, old variant first. Deliberately aligns the columns
        and makes both histograms use the same scale to improve
        readability.

    Arguments:
        args: The command line arguments containing the histogram
            parameters (`histogram_buckets`) and the confidence level
            (`confidence`).
        oldVariant: Display name of the old variant.
        oldExecutions: A non-empty array of floats containing old
            execution times. Values are divided by 1000 before being
            printed as "ms", so they are presumably microseconds.
        newVariant: Display name of the new variant.
        newExecutions: A non-empty array of floats containing new
            execution times, in the same unit as oldExecutions.
    """
    #
    # We take special care to ensure that the histograms will line up (same
    # size bucket at same point on the screen): both are drawn over the
    # combined range of the two samples.
    #
    minExecution = min(min(newExecutions), min(oldExecutions))
    maxExecution = max(max(newExecutions), max(oldExecutions))

    #
    # Pad the range by half a bucket on each side. The factor must be the
    # float 0.5: `(1/2)` is integer division on Python 2 and silently made
    # the padding zero. A single bucket needs no padding (and would
    # otherwise divide by zero).
    #
    if args.histogram_buckets > 1:
        pad = 0.5 * (maxExecution - minExecution) / (args.histogram_buckets - 1)
    else:
        pad = 0.0
    limits = (minExecution - pad, maxExecution + pad)

    oldExecutionHist = GetHistogramString(
        oldExecutions,
        defaultlimits=limits,
        numbins=args.histogram_buckets)
    newExecutionHist = GetHistogramString(
        newExecutions,
        defaultlimits=limits,
        numbins=args.histogram_buckets)

    oldExecutionMeanStr = "μ=" + GetMeanStr(oldExecutions, args.confidence)
    newExecutionMeanStr = "μ=" + GetMeanStr(newExecutions, args.confidence)

    # Column widths shared by both rows so that the separators line up.
    maxMeanLen = max(len(oldExecutionMeanStr), len(newExecutionMeanStr))
    maxVariantLen = max(len(oldVariant), len(newVariant))

    logging.info(term.blue("%-*s : %-*s : %s"),
                 maxVariantLen, oldVariant, maxMeanLen, oldExecutionMeanStr,
                 oldExecutionHist)
    logging.info(term.cyan("%-*s : %-*s : %s"),
                 maxVariantLen, newVariant, maxMeanLen, newExecutionMeanStr,
                 newExecutionHist)
def DoABTest(args, oldVariant, oldExecutions, newVariant, newExecutions):
    """Checks whether the new variant regressed beyond the allowed margin.

    Arguments:
        args: The command line arguments; `max_regression` and
            `confidence` are read here.
        oldVariant: Display name of the old variant.
        oldExecutions: Execution times of the old variant.
        newVariant: Display name of the new variant.
        newExecutions: Execution times of the new variant.

    Returns:
        False when the new executions are significantly slower than the
        old executions scaled by (1 + max_regression), True otherwise.
    """
    logging.debug("AB Testing %s (old) vs %s (new)", oldVariant, newVariant)

    # Old timings inflated by the tolerated regression; the new timings
    # are compared against these.
    allowance = 1 + args.max_regression
    adjustedOld = [allowance * sample for sample in oldExecutions]

    LogPerformanceStats(args, oldVariant, oldExecutions, newVariant, newExecutions)

    #
    # Run a Welch two sample t test to ensure that we have not regressed
    # execution perf.
    #
    # While this test assumes normality, the Welch's variant does *not*
    # assume homoscedasticity (i.e. both populations have the same
    # variance). Other similar tests, such as the Mann-Whitney U test,
    # are sensitive to this property:
    #
    #   If the distributions are heteroscedastic, the Kruskal–Wallis test
    #   won't help you; instead, you should use Welch's t–test for two
    #   groups, or Welch's anova for more than two groups.
    #
    #   http://www.biostathandbook.com/kruskalwallis.html
    #
    _, pValue = stats.ttest_ind(newExecutions, adjustedOld, equal_var=False)

    significant = pValue < 1 - args.confidence
    if not significant:
        logging.debug("Execution had too much variance to make conclusion")
        return True

    newMean = numpy.mean(newExecutions)
    adjustedOldMean = numpy.mean(adjustedOld)
    oldMean = numpy.mean(oldExecutions)

    slower = newMean > adjustedOldMean
    if not slower:
        return True

    # Report the regression relative to the unadjusted old mean.
    regressionPct = ((newMean - oldMean) / oldMean) * 100
    logging.error(
        term.red("Execution regressed by %.1f%% (vs %.1f%%)"),
        regressionPct, args.max_regression*100)

    # TODO Test 99th percentile
    return False
def main():
    """Parses the command line, reads samples from stdin and runs the AB test.

    Standard input is parsed as CSV with two columns per row: the variant
    name and one execution time (read as a float). Blank rows are skipped.
    Exactly two distinct variant names must appear. The old variant is
    the one named by --old-variant, or else the first one encountered.

    Invalid input is reported through the argument parser, which prints
    the message to stderr and exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="AB variant execution performance (old vs new).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # default=0: without it the count is None when -v is absent, and the
    # `>= 1` comparison below raises TypeError on Python 3.
    parser.add_argument("-v", "--verbosity", action="count", default=0,
                        help="Increase output verbosity")
    parser.add_argument("--confidence", type=float, default=0.999,
                        help="Confidence interval (e.g. be 99.9%% confident "
                             "of all reported values.")
    parser.add_argument("--max-regression", type=float, default=0.02,
                        help="Maximum allowed execution regression (e.g. "
                             "new execution must within 2%% of old execution).")
    parser.add_argument("--old-variant", help="Test type to use as old test "
                        "variant. By default, the first test type "
                        "encountered is the old variant.")
    parser.add_argument("--histogram-buckets", type=int, default=15,
                        help="Number of histogram buckets to use.")
    args = parser.parse_args()

    level = logging.DEBUG if args.verbosity >= 1 else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=level)

    executions = defaultdict(list)
    oldVariant = args.old_variant
    for row in csv.reader(sys.stdin):
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        name, executionMicros = row
        oldVariant = oldVariant or name
        executions[name].append(float(executionMicros))

    # Validate explicitly rather than with `assert`, which disappears
    # under `python -O`.
    if len(executions) != 2:
        parser.error("Can only do AB test of two test variants.")
    if oldVariant not in executions:
        # Otherwise both variants would be treated as "new" and the old
        # samples would be missing.
        parser.error("Old variant %r does not appear in the input." % oldVariant)

    newVariant = next(name for name in executions if name != oldVariant)
    DoABTest(args, oldVariant, executions[oldVariant],
             newVariant, executions[newVariant])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment