title | author | date |
---|---|---|
Simulating A/B Tests |
Ed Berry |
04/10/2019 |
library(broom)
library(janitor)
library(data.table)
library(tidyverse)
library(scales)
#======================================
# simulate a week of a test
#======================================
sim_week <- function(baseline, treatment_effect, n_per_week) {
data.table(control = rbinom(n_per_week, 1, baseline),
treatment = rbinom(n_per_week, 1, baseline + treatment_effect))
}
#======================================
# simulate weeks of testing
#======================================
sim_weeks <- function(baseline, treatment_effect, n_per_week, n_weeks) {
# create list where each element is the data
# for a week created by sim_week()
weeks_list = replicate(n_weeks,
sim_week(baseline, treatment_effect, n_per_week),
simplify = F)
# collapse the list of weeks into a
# data.table with a week id column
weeks_dt <- rbindlist(weeks_list, idcol = 'week')
weeks_dt %>%
# melt into long format with week as the id
melt(
id.vars = 'week',
variable.name = 'group',
value.name = 'success'
) %>%
# sum success flag by week and group (treat vs control)
.[, .(successes_per_week = sum(success)), by = c('week', 'group')] %>%
# create the cumulative successes for each week by group
.[, successes := cumsum(successes_per_week), by = 'group'] %>%
# create the cumulative failures by week
.[, failures := week * n_per_week - successes] %>%
# order by week
.[order(week)]
}
#======================================
# simulate weeks of testing with
# a prop.test for each week
#======================================
sim_test <- function(baseline, treatment_effect, n_per_week, n_weeks) {
sim_result <- sim_weeks(baseline, treatment_effect, n_per_week, n_weeks)
# create an empty dt to populate below
dt_prop_test_result = data.table(
week = 1:n_weeks,
control_prop = vector(mode = 'numeric', length = n_weeks),
treatment_prop = vector(mode = 'numeric', length = n_weeks),
p_value = vector(mode = 'numeric', length = n_weeks)
)
for (w in 1:n_weeks) {
# create a matrix with successes and fails in
# the columns control and treatment in the rows
mat = as.matrix(sim_result[week == w, .(successes, failures)])
# do the prop.test
prop_test_result = prop.test(mat)
# update the results data.table
dt_prop_test_result[week == w, `:=`(
control_prop = prop_test_result$estimate[1],
treatment_prop = prop_test_result$estimate[2],
p_value = prop_test_result$p.value
)]
}
return(dt_prop_test_result)
}
#======================================
# simulate a set of tests of the same
# type using the above functions
#======================================
sim_tests <- function(baseline, treatment_effect, n_per_week, n_weeks, n_tests) {
# replicate the test n_tests times
tests_list = replicate(
n_tests,
sim_test(baseline, treatment_effect, n_per_week, n_weeks),
simplify = F
)
# return a data.table with an id col called test
rbindlist(tests_list, idcol = 'test')
}
baseline = 0.5
treatment_effect = 0
n_per_week = 1000
n_weeks = 4
n_tests = 400
df_tests <- sim_tests(baseline, treatment_effect, n_per_week, n_weeks, n_tests) %>%
as_tibble()
df_tests_stop <- df_tests %>%
mutate(stop = as.numeric(p_value < 0.05)) %>%
group_by(test) %>%
mutate(stop_p_value = first(p_value, desc(stop)))
mean(unique(df_tests_stop$stop_p_value) < 0.05)
Overall across all the tests we have a r percent(mean(df_tests$p_value < 0.05))
false positive rate.
However, once we add early stopping into the mix the error rate goes up.
If we look at the first trial where p < 0.05, or the last trial for non-significant trials, then the error rates goes up to r percent(mean(unique(df_tests_stop$stop_p_value) < 0.05))
This chunk includes some functions to simulate A-B Tests using tidyverse
functions.
Benchmarking showed that these functions were slower so they won't be used.
They're being left for posterity.
sim_week <- function(baseline, treatment_effect, n_per_week) {
data.frame(
control = rbinom(n_per_week, 1, baseline),
treatment = rbinom(n_per_week, 1, baseline + treatment_effect)
)
}
sim_weeks <- function(baseline, treatment_effect, n_per_week, n_weeks) {
week_results <- replicate(n_weeks, sim_week(baseline, treatment_effect, n_per_week), simplify = F) %>%
dplyr::bind_rows(.id = 'week')
week_results %>%
pivot_longer(-week, names_to = 'group') %>%
group_by(week, group) %>%
summarise(success = sum(value), fail = sum(value == 0)) %>%
group_by(group) %>%
mutate(success = cumsum(success),
fail = cumsum(fail)) %>%
ungroup()
}
sim_prop_test <- function(baseline, treatment_effect, n_per_week, n_weeks) {
sim_result <- sim_weeks(baseline, treatment_effect, n_per_week, n_weeks)
sim_result %>%
select(-group) %>%
nest(data = c(success, fail)) %>%
mutate(prop_test_result = map(data, ~ prop.test(as.matrix(.x)) %>% tidy())) %>%
unnest(prop_test_result) %>%
rename( estimate_control = estimate1,
estimate_treatment = estimate2) %>%
select(-data)
}