tmalsburg · September 27, 2023 10:15
diff --git a/chinese_color_terms.R b/chinese_color_terms.R

 library(ggplot2)
 library(dplyr)

 # A large `scipen` penalty biases R towards fixed notation over
 # scientific notation when it formats numbers.
 options(scipen = 999)

 # Number of simulations.
 N <- 1e5

 # Data: count per onset, keyed by the onset itself so that every
 # count sits right next to the onset it belongs to.
 onset_counts <- c(
   b  = 16, p  = 16, m  = 18, f = 9,
   d  = 21, t  = 19, n  = 23, l = 24,
   g  = 18, k  = 18, h  = 19,
   j  = 13, q  = 14, x  = 14,
   zh = 20, ch = 18, sh = 19, r = 14,
   z  = 17, c  = 16, s  = 16)

 # One row per onset: its label, its count, and its share of the
 # total count (used as the sampling probability further down).
 onsets <- tibble(
   onset = names(onset_counts),
   count = unname(onset_counts),
   prob  = count / sum(count))

 # Assuming that color terms have random onsets (H0), how likely is
 # it that the warm colors (yellow, red) share the same onset and the
 # cold colors (blue, green) share the same onset?
 #
 # Simulate N random assignments of onsets to the four color terms.
 # All 4 * N onsets are drawn in one vectorised call to sample()
 # instead of N separate calls: draws made with replace = TRUE are
 # independent of each other, so the simulated distribution is the
 # same, but the per-call overhead is gone and N = 0 now yields an
 # empty table (1:N would have iterated over c(1, 0)).
 color_terms <- c("yellow", "red", "blue", "green")

 # Fill row by row, so each row of `sim` is one simulated assignment
 # with one character column per color term.
 sim <- sample(onsets$onset, length(color_terms) * N,
               replace = TRUE, prob = onsets$prob) |>
   matrix(ncol = length(color_terms), byrow = TRUE,
          dimnames = list(NULL, color_terms)) |>
   as.data.frame()

 # Count the simulations in which “yellow” and “red” share an onset,
 # “blue” and “green” share an onset, and the onset of the first
 # pair differs from the onset of the second pair:
 matches <- filter(sim, yellow == red & blue == green & yellow != green)
 i <- nrow(matches)

 # p-value: share of the N simulations that show this pattern.
 p1 <- i / N

 # Assuming that “black” and “white” have random onsets (H0), how
 # likely is it that they start with onsets that are extremely
 # dissimilar with respect to place and mode of articulation?
 #
 # Assign random onsets to black and white and simulate N times.
 # Both onsets of all N simulations are drawn in one vectorised call
 # to sample() instead of N separate calls: draws made with
 # replace = TRUE are independent of each other, so the simulated
 # distribution is the same, but the per-call overhead is gone and
 # N = 0 now yields an empty table (1:N would have iterated over
 # c(1, 0)).
 bw_terms <- c("black", "white")

 # Fill row by row, so each row of `sim` is one simulated assignment
 # with one character column per term.
 sim <- sample(onsets$onset, length(bw_terms) * N,
               replace = TRUE, prob = onsets$prob) |>
   matrix(ncol = length(bw_terms), byrow = TRUE,
          dimnames = list(NULL, bw_terms)) |>
   as.data.frame()

 # Count cases with extremely dissimilar onsets.
 #
 # is_dissimilar(a, b) is TRUE where the pair (a, b) is one of:
 #   a in {p, b} with b = h
 #   a = m       with b in {g, k, h}
 #   a = f       with b in {g, k}
 # It is applied in both directions below, so the order in which the
 # two onsets fall on black and white does not matter.
 is_dissimilar <- function(a, b) {
   (a %in% c("p", "b") & b %in% c("h")) |
     (a %in% c("m")    & b %in% c("g", "k", "h")) |
     (a %in% c("f")    & b %in% c("g", "k"))
 }

 i <- sim |>
   filter(is_dissimilar(black, white) | is_dissimilar(white, black)) |>
   nrow()

 # p-value: share of the N simulations with such a pair.
 p2 <- i / N

 # Combined p-value: product of the two p-values computed above.
 p3 <- p1 * p2

 # Report each p-value as "<name>: <value>".
 for (label in c("p1", "p2", "p3")) {
   message(label, ": ", get(label))
 }

	library(ggplot2)
	library(dplyr)

	# A large `scipen` penalty biases R towards fixed notation over
	# scientific notation when it formats numbers.
	options(scipen = 999)

	# Number of simulations.
	N <- 1e5

	# Data: count per onset, keyed by the onset itself so that every
	# count sits right next to the onset it belongs to.
	onset_counts <- c(
	  b  = 16, p  = 16, m  = 18, f = 9,
	  d  = 21, t  = 19, n  = 23, l = 24,
	  g  = 18, k  = 18, h  = 19,
	  j  = 13, q  = 14, x  = 14,
	  zh = 20, ch = 18, sh = 19, r = 14,
	  z  = 17, c  = 16, s  = 16)

	# One row per onset: its label, its count, and its share of the
	# total count (used as the sampling probability further down).
	onsets <- tibble(
	  onset = names(onset_counts),
	  count = unname(onset_counts),
	  prob  = count / sum(count))

	# Assuming that color terms have random onsets (H0), how likely is
	# it that the warm colors (yellow, red) share the same onset and the
	# cold colors (blue, green) share the same onset?
	#
	# Simulate N random assignments of onsets to the four color terms.
	# The pipes are written as plain `|>` (the escaped form `\|>` is not
	# valid R). All 4 * N onsets are drawn in one vectorised call to
	# sample() instead of N separate calls: draws made with
	# replace = TRUE are independent of each other, so the simulated
	# distribution is the same, but the per-call overhead is gone and
	# N = 0 now yields an empty table (1:N would have iterated over
	# c(1, 0)).
	color_terms <- c("yellow", "red", "blue", "green")

	# Fill row by row, so each row of `sim` is one simulated assignment
	# with one character column per color term.
	sim <- sample(onsets$onset, length(color_terms) * N,
	              replace = TRUE, prob = onsets$prob) |>
	  matrix(ncol = length(color_terms), byrow = TRUE,
	         dimnames = list(NULL, color_terms)) |>
	  as.data.frame()

	# Count the simulations in which “yellow” and “red” share an onset,
	# “blue” and “green” share an onset, and the onset of the first
	# pair differs from the onset of the second pair. Written as plain
	# function calls; the escaped pipe `\|>` is not valid R.
	matches <- filter(sim, yellow == red & blue == green & yellow != green)
	i <- nrow(matches)

	# p-value: share of the N simulations that show this pattern.
	p1 <- i / N

	# Assuming that “black” and “white” have random onsets (H0), how
	# likely is it that they start with onsets that are extremely
	# dissimilar with respect to place and mode of articulation?
	#
	# Assign random onsets to black and white and simulate N times.
	# The pipes are written as plain `|>` (the escaped form `\|>` is not
	# valid R). Both onsets of all N simulations are drawn in one
	# vectorised call to sample() instead of N separate calls: draws
	# made with replace = TRUE are independent of each other, so the
	# simulated distribution is the same, but the per-call overhead is
	# gone and N = 0 now yields an empty table (1:N would have iterated
	# over c(1, 0)).
	bw_terms <- c("black", "white")

	# Fill row by row, so each row of `sim` is one simulated assignment
	# with one character column per term.
	sim <- sample(onsets$onset, length(bw_terms) * N,
	              replace = TRUE, prob = onsets$prob) |>
	  matrix(ncol = length(bw_terms), byrow = TRUE,
	         dimnames = list(NULL, bw_terms)) |>
	  as.data.frame()

	# Count cases with extremely dissimilar onsets.
	#
	# is_dissimilar(a, b) is TRUE where the pair (a, b) is one of:
	#   a in {p, b} with b = h
	#   a = m       with b in {g, k, h}
	#   a = f       with b in {g, k}
	# It is applied in both directions below, so the order in which the
	# two onsets fall on black and white does not matter. This accepts
	# the same pairs as listing every direction explicitly.
	is_dissimilar <- function(a, b) {
	  (a %in% c("p", "b") & b %in% c("h")) |
	    (a %in% c("m")    & b %in% c("g", "k", "h")) |
	    (a %in% c("f")    & b %in% c("g", "k"))
	}

	# Plain `|>` pipes: the escaped form `\|>` is not valid R.
	i <- sim |>
	  filter(is_dissimilar(black, white) | is_dissimilar(white, black)) |>
	  nrow()

	# p-value: share of the N simulations with such a pair.
	p2 <- i / N

	# Combined p-value: product of the two p-values computed above.
	p3 <- p1 * p2

	# Report each p-value as "<name>: <value>".
	for (label in c("p1", "p2", "p3")) {
	  message(label, ": ", get(label))
	}