Simulating traffic interactions as described in https://vista.io/blog/interaction-effects-in-online-experimentation to see which of two types of tests is better suited
library(tidyverse)
# Simulation and test function --------------------------------------------
#' n = number of user IDs to simulate
#' p = proportion (0-1) of experiment 1 treatment traffic to exclude from experiment 2
simulate_and_test <- function(n = 1000, p = 0) {
  #message("n = ", n, " and p = ", p)
  # Simulate a random data set of users who are in the control, in the
  # treatment, or out (NA) of each of two experiments
  d <- tibble(
    user_id = seq_len(n),
    exp_1 = sample(c("Control", "Treatment", NA), n, replace = TRUE),
    exp_2 = sample(c("Control", "Treatment", NA), n, replace = TRUE)
  )
  # If desired, remove users in the experiment 1 treatment from experiment 2
  # with probability = p to simulate the relevant traffic interaction.
  # The !is.na(exp_1) guard keeps the condition from evaluating to NA for
  # users who are out of experiment 1 (which would wrongly drop them too).
  if (p > 0) {
    d <- d %>%
      mutate(exp_2 = if_else(!is.na(exp_1) & exp_1 == "Treatment" & !is.na(exp_2) & runif(n) < p,
                             NA_character_, exp_2))
  }
  # Count exposure combinations
  combo_count <- d %>%
    count(exp_1, exp_2)
  # First test: chi-square test of independence between experiment 1 variant
  # and being in/out of experiment 2
  cell_values <- combo_count %>%
    filter(!is.na(exp_1)) %>%
    mutate(exp_2 = if_else(is.na(exp_2), "Out", "In")) %>%
    group_by(exp_1, exp_2) %>%
    summarise(n = sum(n), .groups = "drop")
  # 2x2 matrix: rows = in/out of experiment 2, columns = experiment 1 variant
  user_flow_matrix <- matrix(cell_values$n, nrow = 2)
  dimnames(user_flow_matrix) <- list(exp_2 = unique(cell_values$exp_2),
                                     exp_1 = unique(cell_values$exp_1))
  p_value_independence <- chisq.test(user_flow_matrix)$p.value
  # Second test: chi-square goodness-of-fit test of users falling into each
  # combination of experiment 1 and experiment 2 variants.
  # Note we're assuming a 50/50 split in both experiments, but other splits
  # could be accommodated by changing the expected proportions below.
  cell_values <- combo_count %>%
    filter(!is.na(exp_1), !is.na(exp_2))
  combo_ns <- cell_values$n
  p_value_goodness <- chisq.test(x = combo_ns, p = c(.5*.5, .5*.5, .5*.5, .5*.5))$p.value
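  # For example (hypothetical split, not in the original gist), a 60/40
  # control/treatment split in both experiments would use
  # p = c(.6*.6, .6*.4, .4*.6, .4*.4), matching count()'s row order:
  # Control/Control, Control/Treatment, Treatment/Control, Treatment/Treatment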
  # Return the p-values of each test
  c(independence_p = p_value_independence,
    goodness_p = p_value_goodness)
}
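# Quick sanity check (illustrative call, not part of the original gist): a
# single draw with 20% of experiment 1 treatment users excluded from
# experiment 2 should tend to produce a small independence p-value
simulate_and_test(n = 10000, p = 0.2)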
#' Assume we have two experiments: 1 and 2.
#' Both split traffic 50/50 between two variants, Control and Treatment.
#' Let's start with a case where everything is fine.
#' We'll run 1000 iterations of simulated data and compare the results.
i <- 1000
set.seed(20240603)
sims_unbiased <- map_df(seq_len(i), ~ simulate_and_test())
sims_unbiased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()
sims_unbiased %>%
  summarise(independence_fpr = sum(independence_p < .05)/n(),
            goodness_fpr = sum(goodness_p < .05)/n())
# Both tests hold a false positive rate close to the nominal 5%,
# but their p-values are essentially uncorrelated.
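# As an extra check (not in the original gist): under the null, each test's
# p-values should be roughly uniform on [0, 1]; a Kolmogorov-Smirnov test
# against the uniform distribution is one way to confirm this
ks.test(sims_unbiased$independence_p, "punif")
ks.test(sims_unbiased$goodness_p, "punif")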
#' Let's now assume the same setup BUT that there's some interaction problem.
#' Specifically, that traffic in the treatment of experiment 1 is less likely to
#' get to experiment 2.
#' We'll run 1000 iterations of simulated data and compare the results.
#' Across these, we'll vary the proportion of traffic that gets excluded from .05 to .3
i <- 1000
set.seed(20240603)
p <- runif(i, min = .05, max = .3)
sims_biased <- map2_df(seq_len(i), p, ~ simulate_and_test(p = .y))
sims_biased %>%
  ggplot(aes(independence_p, goodness_p)) +
  geom_point()
sims_biased %>%
  ggplot(aes(independence_p)) +
  geom_histogram()
sims_biased %>%
  ggplot(aes(goodness_p)) +
  geom_histogram()
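# Under the simulated interaction, the p-value histograms should pile up near
# zero; the more mass away from zero, the lower that test's power to detect it.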
sims_biased %>%
  summarise(independence_fnr = sum(independence_p >= .05)/n(),
            goodness_fnr = sum(goodness_p >= .05)/n())
# While both tests detect the bias, the goodness-of-fit test shows a clear
# increase in false negatives, suggesting the test of independence is more
# effective for this case.
# NOTE, however, that the test of independence naturally works with a larger
# sample: it also uses users who never reach experiment 2, which the
# goodness-of-fit test discards.
# If you bump the sample size up in the simulation (e.g., n = 100,000), the tests
# perform equally well across the current range of values for p.
# So at very high sample sizes, the difference is likely less of a concern.
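# A sketch of that large-n check (illustrative, not in the original gist);
# fewer iterations keep the runtime manageable at n = 100,000
i_large <- 100
set.seed(20240603)
p_large <- runif(i_large, min = .05, max = .3)
sims_large <- map2_df(seq_len(i_large), p_large, ~ simulate_and_test(n = 1e5, p = .y))
sims_large %>%
  summarise(independence_fnr = sum(independence_p >= .05)/n(),
            goodness_fnr = sum(goodness_p >= .05)/n())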