library(ggplot2) library(dplyr) options(scipen=999) N <- 100000 # Number of simulations # Data: onsets <- tibble( onset = c("b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s"), count = c(16, 16, 18, 9, 21, 19, 23, 24, 18, 18, 19, 13, 14, 14, 20, 18, 19, 14, 17, 16, 16), prob = count / sum(count)) # Assuming that color terms have random onsets # (H0), how likely is it that warm colors (yellow, # red) share the same onset and cold colors (blue, # green) share the same onset? sapply(1:N, function(i) { with(onsets, sample(onset, 4, replace=TRUE, prob=prob)) }) |> t() |> data.frame() |> dplyr::rename(yellow=X1, red=X2, blue=X3, green=X4) -> sim # Count cases where “yellow” and “red”, and “blue” # and “green” share the same onset: sim |> filter( yellow == red, blue == green, yellow != green) |> nrow() -> i # p-value: i/N -> p1 # Assuming that “black” and “white” have random # onsets (H0), how likely is it that they start # with onsets that are extremely dissimilar with # respect to place and mode of articulation? # Assign random onsets to black and white and # simulate N time: sapply(1:N, function(i) { with(onsets, sample(onset, 2, replace=TRUE, prob=prob)) }) |> t() |> data.frame() |> dplyr::rename(black=X1, white=X2) -> sim # Count cases with extremely dissimilar onsets: sim |> filter( case_when( black %in% c("p", "b") & white %in% c("h") ~ TRUE, black %in% c("m") & white %in% c("g", "k", "h") ~ TRUE, black %in% c("g", "k") & white %in% c("m", "f") ~ TRUE, black %in% c("f") & white %in% c("g", "k") ~ TRUE, white %in% c("p", "b") & black %in% c("h") ~ TRUE, white %in% c("m") & black %in% c("g", "k", "h") ~ TRUE, white %in% c("g", "k") & black %in% c("m", "f") ~ TRUE, white %in% c("f") & black %in% c("g", "k") ~ TRUE, .default = FALSE) ) |> nrow() -> i # p-value: i/N -> p2 p1 * p2 -> p3 message("p1: ", p1) message("p2: ", p2) message("p3: ", p3)