A/B test simulations for my own learning.
# Simulating running an A/B test several times to see false negatives vs. statistical power.
#
# https://github.com/bmuller/abanalyzer
# gem install abanalyzer

require "abanalyzer"

# A converts at 50%. B converts at 66.66…%.
treatment_a = -> { ["converted", "unconverted"].sample }
treatment_b = -> { ["converted", "converted", "unconverted"].sample }

# Helper to express a count as a percentage of a total.
percent = ->(n, total) { (n / total.to_f * 100).round(2) }

# Sample size from http://www.evanmiller.org/ab-testing/sample-size.html with:
# Baseline conversion rate: 50%
# Minimum detectable effect: 16.66% absolute
statistical_power = 80 # %
significance_level = 0.05
sample_size = 139 # per variation
# Run 100 tests to see how many give the wrong result.
100.times do
  a_conversions = 0
  b_conversions = 0

  sample_size.times do
    a_conversions += 1 if treatment_a.call == "converted"
    b_conversions += 1 if treatment_b.call == "converted"
  end

  # Fancy "G-test".
  tester = ABAnalyzer::ABTest.new(
    a: { converted: a_conversions, unconverted: sample_size - a_conversions },
    b: { converted: b_conversions, unconverted: sample_size - b_conversions },
  )
  ab_different = tester.different?(significance_level)

  unless ab_different
    puts "False negative! Detected no diff where there is one. Should happen ~#{100 - statistical_power}/100 times."
  end
end
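
# A minimal sketch (my addition, not from the abanalyzer gem) of where a sample
# size like 139 comes from, using the standard two-proportion z-test power
# formula. Evan Miller's calculator derives it slightly differently, so its
# numbers come out a few samples higher than this formula's.
Z_ALPHA = 1.96   # two-sided z-score for a 0.05 significance level
Z_BETA  = 0.8416 # z-score for 80% statistical power

def sample_size_per_variation(p1, p2)
  pooled = (p1 + p2) / 2.0
  numerator = Z_ALPHA * Math.sqrt(2 * pooled * (1 - pooled)) +
              Z_BETA * Math.sqrt(p1 * (1 - p1) + p2 * (1 - p2))
  (numerator**2 / (p2 - p1)**2).ceil
end

puts sample_size_per_variation(0.5, 2 / 3.0) # => 137, vs. 139 from the calculator
puts sample_size_per_variation(0.5, 0.55)    # => 1565, vs. 1567 in the script below
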
# Simulating running an A/B test several times to see false positives vs. significance level.
#
# https://github.com/bmuller/abanalyzer
# gem install abanalyzer

require "abanalyzer"

# A and B both convert at 50%.
treatment_a = -> { ["converted", "unconverted"].sample }
treatment_b = -> { ["converted", "unconverted"].sample }

# Helper to express a count as a percentage of a total.
percent = ->(n, total) { (n / total.to_f * 100).round(2) }

# Sample size from http://www.evanmiller.org/ab-testing/sample-size.html with:
# Baseline conversion rate: 50%
# Minimum detectable effect: 5% absolute
significance_level = 0.05
sample_size = 1567 # per variation
# Run 100 tests to see how many give the wrong result.
100.times do
  a_conversions = 0
  b_conversions = 0

  sample_size.times do
    a_conversions += 1 if treatment_a.call == "converted"
    b_conversions += 1 if treatment_b.call == "converted"
  end

  # Fancy "G-test".
  tester = ABAnalyzer::ABTest.new(
    a: { converted: a_conversions, unconverted: sample_size - a_conversions },
    b: { converted: b_conversions, unconverted: sample_size - b_conversions },
  )
  ab_different = tester.different?(significance_level)

  if ab_different
    puts "False positive! Detected a diff where there is none. Should happen ~#{(significance_level * 100).round}/100 times."
  end
end
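
# A minimal sketch of the G-test that abanalyzer's different? presumably runs
# (an assumption; I haven't read the gem's internals). For a 2x2 table:
# G = 2 * sum(observed * ln(observed / expected)), compared to the chi-squared
# critical value with 1 degree of freedom.
CHI_SQUARED_CRITICAL_1DF_95 = 3.841 # significance level 0.05

def g_test_different?(a_converted, a_unconverted, b_converted, b_unconverted)
  observed = [a_converted, a_unconverted, b_converted, b_unconverted].map(&:to_f)
  total = observed.sum
  row_totals = [a_converted + a_unconverted, b_converted + b_unconverted]
  col_totals = [a_converted + b_converted, a_unconverted + b_unconverted]

  g = 0.0
  2.times do |row|
    2.times do |col|
      o = observed[row * 2 + col]
      e = row_totals[row] * col_totals[col] / total
      g += 2 * o * Math.log(o / e) if o > 0
    end
  end
  g > CHI_SQUARED_CRITICAL_1DF_95
end

# 70/139 vs. 93/139 conversions: G is about 7.9, above 3.841, so "different".
puts g_test_different?(70, 69, 93, 46) # => true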