mikekaminsky · May 6, 2020 01:37
diff --git a/statistical_paradoxes.R b/statistical_paradoxes.R
 library(dplyr)
 library(progress)
 library(foreach)
 library(doParallel)
 library(tidyr)
 library(magrittr)

 ##########################
 # Blog Post Example
 ##########################

 p_x <- 0.2
 R <- -0.1
 total_pop <- 1000000

 P_J_XJ1 <- p_x + p_x * (1 - p_x) * R
 P_J_XJ0 <- p_x - p_x * (1 - p_x) * R
 delta <- (P_J_XJ1 - P_J_XJ0)


 total_pop <- 1000000
 true_choice <- sample(c("Hawaiian", "Other"),
  size = total_pop,
  prob = c(p_x, 1 - p_x),
  replace = TRUE
 )

 true_prop <- true_choice %>%
  table() %>%
  as.list() %>%
  lapply(., function(x) x / length(true_choice))

 n_sims <- 200
 R <- -0.1

 sample_sizes <- c(50, 100, 1000, 10000, 100000, 500000)
 jj <- 1

 samp_strats <- c("srs", "biased")



 cores <- detectCores()
 cl <- makeCluster(pmin(cores[1] - 1, length(samp_strats)), outfile = "")
 registerDoParallel(cl)

 res_strats <- foreach(style = iter(samp_strats), .combine = plyr::rbind.fill) %dopar% {
  pb <- progress::progress_bar$new(
    format = "[:bar] :current/:total (:percent)",
    total = n_sims * length(sample_sizes)
  )
  pb$tick(0)
  library(magrittr)
  library(dplyr)
  res_l <- vector(mode = "list", length = n_sims * length(sample_sizes))
  for (sample_size in sample_sizes) {
    resp_p <- sample_size / total_pop
    res_p_t <- resp_p - (1 - p_x) * resp_p * R
    res_p_c <- resp_p + (1-(1 - p_x)) * resp_p * R
    for (i in seq(n_sims)) {
      if (style == "srs") {
        sample <- sample(true_choice, size = sample_size, replace = FALSE)
      } else {
        c_t <- rep(0, total_pop)
        # Who responds among people who like Hawaiian?
        c_t[true_choice == "Hawaiian"] <- sample(c(1, 0),
          length(c_t[true_choice == "Hawaiian"]),
          prob = c(res_p_t, 1 - res_p_t),
          replace = TRUE
        )
        # Who responds among people who like other?
        c_t[true_choice == "Other"] <- sample(c(1, 0),
          length(c_t[true_choice == "Other"]),
          prob = c(res_p_c, 1 - res_p_c),
          replace = TRUE
        )
        sample <- true_choice[c_t == 1]
      }
      df <- sample %>%
        table() %>%
        as.list() %>%
        lapply(., function(x) x / length(sample)) %>%
        data.frame() %>%
        dplyr::mutate(
          n = sample_size,
          type = style
        )
      res_l[[jj]] <- df
      jj <- jj + 1
      pb$tick()
    }
  }
  plyr::rbind.fill(res_l) %>%
    mutate(
      error = Hawaiian - true_prop$Hawaiian,
      error_pct = error / true_prop$Hawaiian
    )
 }

 # Compile statistics table
 res_strats %>%
  group_by(n, type) %>%
  summarize(
    rmse = sqrt(mean(error**2)),
    mape = mean(abs(error_pct))
  ) %>%
  pivot_wider(id_cols = "n", names_from = "type", values_from = c("rmse", "mape"))


 stopCluster(cl)
	library(dplyr)
	library(progress)
	library(foreach)
	library(doParallel)
	library(tidyr)
	library(magrittr)

	##########################
	# Blog Post Example
	##########################

	p_x <- 0.2
	R <- -0.1
	total_pop <- 1000000

	P_J_XJ1 <- p_x + p_x * (1 - p_x) * R
	P_J_XJ0 <- p_x - p_x * (1 - p_x) * R
	delta <- (P_J_XJ1 - P_J_XJ0)


	total_pop <- 1000000
	true_choice <- sample(c("Hawaiian", "Other"),
	size = total_pop,
	prob = c(p_x, 1 - p_x),
	replace = TRUE
	)

	true_prop <- true_choice %>%
	table() %>%
	as.list() %>%
	lapply(., function(x) x / length(true_choice))

	n_sims <- 200
	R <- -0.1

	sample_sizes <- c(50, 100, 1000, 10000, 100000, 500000)
	jj <- 1

	samp_strats <- c("srs", "biased")



	cores <- detectCores()
	cl <- makeCluster(pmin(cores[1] - 1, length(samp_strats)), outfile = "")
	registerDoParallel(cl)

	res_strats <- foreach(style = iter(samp_strats), .combine = plyr::rbind.fill) %dopar% {
	pb <- progress::progress_bar$new(
	format = "[:bar] :current/:total (:percent)",
	total = n_sims * length(sample_sizes)
	)
	pb$tick(0)
	library(magrittr)
	library(dplyr)
	res_l <- vector(mode = "list", length = n_sims * length(sample_sizes))
	for (sample_size in sample_sizes) {
	resp_p <- sample_size / total_pop
	res_p_t <- resp_p - (1 - p_x) * resp_p * R
	res_p_c <- resp_p + (1-(1 - p_x)) * resp_p * R
	for (i in seq(n_sims)) {
	if (style == "srs") {
	sample <- sample(true_choice, size = sample_size, replace = FALSE)
	} else {
	c_t <- rep(0, total_pop)
	# Who responds among people who like Hawaiian?
	c_t[true_choice == "Hawaiian"] <- sample(c(1, 0),
	length(c_t[true_choice == "Hawaiian"]),
	prob = c(res_p_t, 1 - res_p_t),
	replace = TRUE
	)
	# Who responds among people who like other?
	c_t[true_choice == "Other"] <- sample(c(1, 0),
	length(c_t[true_choice == "Other"]),
	prob = c(res_p_c, 1 - res_p_c),
	replace = TRUE
	)
	sample <- true_choice[c_t == 1]
	}
	df <- sample %>%
	table() %>%
	as.list() %>%
	lapply(., function(x) x / length(sample)) %>%
	data.frame() %>%
	dplyr::mutate(
	n = sample_size,
	type = style
	)
	res_l[[jj]] <- df
	jj <- jj + 1
	pb$tick()
	}
	}
	plyr::rbind.fill(res_l) %>%
	mutate(
	error = Hawaiian - true_prop$Hawaiian,
	error_pct = error / true_prop$Hawaiian
	)
	}

	# Compile statistics table
	res_strats %>%
	group_by(n, type) %>%
	summarize(
	rmse = sqrt(mean(error**2)),
	mape = mean(abs(error_pct))
	) %>%
	pivot_wider(id_cols = "n", names_from = "type", values_from = c("rmse", "mape"))


	stopCluster(cl)