library(ggplot2)
library(dplyr)

options(scipen=999)

N <- 100000  # Number of simulations

# Data:

onsets <- tibble(
  onset = c("b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h",
            "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s"),
  count = c(16, 16, 18, 9, 21, 19, 23, 24, 18, 18, 19, 13, 14, 14, 20,
            18, 19, 14, 17, 16, 16),
  prob  = count / sum(count))

# Assuming that color terms have random onsets
# (H0), how likely is it that warm colors (yellow,
# red) share the same onset and cold colors (blue,
# green) share the same onset?

sapply(1:N, function(i) {
  with(onsets,
       sample(onset, 4, replace=TRUE, prob=prob))
}) |>
  t() |>
  data.frame() |>
  dplyr::rename(yellow=X1, red=X2, blue=X3, green=X4) -> sim

# Count cases where “yellow” and “red”, and “blue”
# and “green” share the same onset:

sim |>
  filter(
    yellow == red,
    blue   == green,
    yellow != green) |>
  nrow() -> i

# p-value:
i/N -> p1

# Assuming that “black” and “white” have random
# onsets (H0), how likely is it that they start
# with onsets that are extremely dissimilar with
# respect to place and mode of articulation?

# Assign random onsets to black and white and
# simulate N time:

sapply(1:N, function(i) {
  with(onsets,
       sample(onset, 2, replace=TRUE, prob=prob))
}) |>
  t() |>
  data.frame() |>
  dplyr::rename(black=X1, white=X2) -> sim

# Count cases with extremely dissimilar onsets:
sim |>
  filter(
    case_when(
      black %in% c("p", "b") & white %in% c("h")           ~ TRUE,
      black %in% c("m")      & white %in% c("g", "k", "h") ~ TRUE,
      black %in% c("g", "k") & white %in% c("m", "f")      ~ TRUE,
      black %in% c("f")      & white %in% c("g", "k")      ~ TRUE,
      white %in% c("p", "b") & black %in% c("h")           ~ TRUE,
      white %in% c("m")      & black %in% c("g", "k", "h") ~ TRUE,
      white %in% c("g", "k") & black %in% c("m", "f")      ~ TRUE,
      white %in% c("f")      & black %in% c("g", "k")      ~ TRUE,
      .default = FALSE)
  ) |>
  nrow() -> i

# p-value:
i/N  -> p2

p1 * p2 -> p3

message("p1: ", p1)
message("p2: ", p2)
message("p3: ", p3)