Simulating traffic interactions as described in https://vista.io/blog/interaction-effects-in-online-experimentation to see which of two types of tests is better suited
library(tidyverse)
# Simulation and test function --------------------------------------------
#' n = number of user IDs to simulate
#' p = proportion (0-1) of experiment 1 treatment traffic to exclude from experiment 2
simulate_and_test <- function(n = 1000, p = 0) {
  #message("n = ", n, " and p = ", p)
  # Simulate a random data set of users who are in the control, in the
  # treatment, or out (NA) of each of two experiments
  d <- tibble(
    user_id = seq_len(n),
    exp_1 = sample(c("Control", "Treatment", NA), n, replace = TRUE),
    exp_2 = sample(c("Control", "Treatment", NA), n, replace = TRUE)
  )
  # If desired, remove users in the experiment 1 treatment from experiment 2
  # with probability = p to simulate the relevant traffic interaction.
  # The !is.na(exp_1) guard keeps the condition from evaluating to NA for
  # users who are out of experiment 1 (which would wrongly drop them too).
  if (p > 0) {
    d <- d %>%
      mutate(exp_2 = if_else(!is.na(exp_1) & exp_1 == "Treatment" & !is.na(exp_2) & runif(n) < p,
                             NA_character_, exp_2))
  }
  # Count exposure combinations
  combo_count <- d %>%
    count(exp_1, exp_2)
  # First test: chi-square test of independence between experiment 1 variant
  # and being in/out of experiment 2
  cell_values <- combo_count %>%
    filter(!is.na(exp_1)) %>%
    mutate(exp_2 = if_else(is.na(exp_2), "Out", "In")) %>%
    group_by(exp_1, exp_2) %>%
    summarise(n = sum(n), .groups = "drop")
  # 2x2 matrix: rows = in/out of experiment 2, columns = experiment 1 variant
  user_flow_matrix <- matrix(cell_values$n, nrow = 2)
  dimnames(user_flow_matrix) <- list(exp_2 = unique(cell_values$exp_2),
                                     exp_1 = unique(cell_values$exp_1))
  p_value_independence <- chisq.test(user_flow_matrix)$p.value
  # Second test: chi-square goodness-of-fit test of users falling into each
  # combination of experiment 1 and experiment 2 variants.
  # Note we're assuming a 50/50 split in both experiments, but other splits
  # could be accommodated by changing the expected proportions below.
  cell_values <- combo_count %>%
    filter(!is.na(exp_1), !is.na(exp_2))
  combo_ns <- cell_values$n
  p_value_goodness <- chisq.test(x = combo_ns, p = c(.5*.5, .5*.5, .5*.5, .5*.5))$p.value
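  # For example (hypothetical split, not in the original gist), a 60/40
  # control/treatment split in both experiments would use
  # p = c(.6*.6, .6*.4, .4*.6, .4*.4), matching count()'s row order:
  # Control/Control, Control/Treatment, Treatment/Control, Treatment/Treatment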
  # Return the p-values of each test
  c(independence_p = p_value_independence,
    goodness_p = p_value_goodness)
}
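# Quick sanity check (illustrative call, not part of the original gist): a
# single draw with 20% of experiment 1 treatment users excluded from
# experiment 2 should tend to produce a small independence p-value
simulate_and_test(n = 10000, p = 0.2)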
#' Assume we have two experiments: 1 and 2.
#' Both split traffic 50/50 between two variants, Control and Treatment.
#' Let's start with a case where everything is fine.
#' We'll run 1000 iterations of simulated data and compare the results.
i <- 1000
set.seed(20240603)
sims_unbiased <- map_df(seq_len(i), ~ simulate_and_test())
sims_unbiased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()
sims_unbiased %>%
  summarise(independence_fpr = sum(independence_p < .05)/n(),
            goodness_fpr = sum(goodness_p < .05)/n())
# Both tests hold a false positive rate close to the nominal 5%,
# but their p-values are essentially uncorrelated.
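# As an extra check (not in the original gist): under the null, each test's
# p-values should be roughly uniform on [0, 1]; a Kolmogorov-Smirnov test
# against the uniform distribution is one way to confirm this
ks.test(sims_unbiased$independence_p, "punif")
ks.test(sims_unbiased$goodness_p, "punif")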
#' Let's now assume the same setup BUT that there's some interaction problem.
#' Specifically, that traffic in the treatment of experiment 1 is less likely to
#' get to experiment 2.
#' We'll run 1000 iterations of simulated data and compare the results.
#' Across these, we'll vary the proportion of traffic that gets excluded from .05 to .3
i <- 1000
set.seed(20240603)
p <- runif(i, min = .05, max = .3)
sims_biased <- map2_df(seq_len(i), p, ~ simulate_and_test(p = .y))
sims_biased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()
sims_biased %>%
  ggplot(aes(independence_p)) +
  geom_histogram()
sims_biased %>%
  ggplot(aes(goodness_p)) +
  geom_histogram()
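# Under the simulated interaction, the p-value histograms should pile up near
# zero; the more mass away from zero, the lower that test's power to detect it.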
sims_biased %>%
  summarise(independence_fnr = sum(independence_p >= .05)/n(),
            goodness_fnr = sum(goodness_p >= .05)/n())
# While both tests detect the bias, the goodness-of-fit test shows a clear
# increase in false negatives, suggesting the test of independence is more
# effective for this case.
# NOTE, however, that the test of independence naturally works with a larger
# sample: it also uses users who never reach experiment 2, which the
# goodness-of-fit test discards.
# If you bump the sample size up in the simulation (e.g., n = 100,000), the tests
# perform equally well across the current range of values for p.
# So at very high sample sizes, the difference is likely less of a concern.
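# A sketch of that large-n check (illustrative, not in the original gist);
# fewer iterations keep the runtime manageable at n = 100,000
i_large <- 100
set.seed(20240603)
p_large <- runif(i_large, min = .05, max = .3)
sims_large <- map2_df(seq_len(i_large), p_large, ~ simulate_and_test(n = 1e5, p = .y))
sims_large %>%
  summarise(independence_fnr = sum(independence_p >= .05)/n(),
            goodness_fnr = sum(goodness_p >= .05)/n())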