Tadge-Analytics · October 29, 2019 00:08
diff --git a/birthday_problem b/birthday_problem
 library(tidyverse)

 # Let's generate our different trials data
 # and save to rds

 test_if_sameness <- function(size_of_group) {
  tibble(person_id = 1:size_of_group) %>% 
    mutate(birthday = sample(1:365, n(), replace = T)) %>% 
    mutate(other_dates = list(rename(., person_id1 = person_id,
                                     birthday1 = birthday))) %>% 
    unnest(other_dates) %>% 
    filter(person_id != person_id1) %>% 
    filter(birthday == birthday1) %>% 
    nrow() > 0
 }



 bday_sample <- function(num_repeated_trials, sizes_of_groups, number_of_trials) {
  
  tibble(group_size = sizes_of_groups) %>% 
    crossing(trial_id = 1:number_of_trials,
             num_repeated_trials = 1:num_repeated_trials) %>% 
    mutate(outcome = furrr::future_map_lgl(group_size, ~test_if_sameness(.x))) %>% 
    group_by(group_size, num_repeated_trials) %>% 
    summarise(count_of_success = sum(outcome),
              percent_success = count_of_success/number_of_trials)
  
 }



 library(furrr)
 future::plan(multiprocess)

 set.seed(123)
 first <- bday_sample(1, 1:60, 1000)

 set.seed(123)
 second <- bday_sample(20, 22:24, 1000)

 set.seed(123)
 big_sample <- bday_sample(20, 23, 10000)



 write_rds(first, "first.rds")
 write_rds(second, "second.rds")
 write_rds(big_sample, "big_sample.rds")



 #############################################

 library(tidyverse)

 # let's start our analysis

 first <- read_rds("first.rds")
 second <- read_rds("second.rds")
 big_sample <- read_rds("big_sample.rds")



 first %>% 
  ggplot() +
  aes(group_size, percent_success) +
  geom_line(stat = "identity")



 data_for_plot <- second %>% 
  group_by(group_size) %>% 
  summarise(mean_success = mean(percent_success),
            sd_success = sd(percent_success),
            num_repeated_trials = n(),
            over_50 = sum(percent_success > 0.5),
            probability_chance_is_over_50 = over_50/num_repeated_trials)



 second %>% 
  ggplot() +
  aes(percent_success) +
  geom_histogram(aes(fill = percent_success > 0.5)) +
  facet_grid(. ~ group_size) +
  labs(fill = "Over 50%") +
  guides(fill = guide_legend(reverse=T)) +
  geom_vline(data = data_for_plot, aes(xintercept = mean_success), col = 'black', linetype = "dashed", alpha = 0.3, size = 1) +
  geom_label(data = data_for_plot, aes(mean_success, y = 3.5, label = paste0("mean = \n", round(mean_success, 3))))



 # Assuming normality, here are the distributions
 tibble(x = c(0.40, 0.60)) %>% 
  ggplot() +
  aes(x) +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[1, 2]], sd = data_for_plot[[1, 3]]), col = "red") +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]), col = "blue") +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[3, 2]], sd = data_for_plot[[3, 3]]), col = "black")




 # for shading area under the curve.
 stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]),
                xlim = c(0.474, 0.545), geom = "area", fill = "blue", alpha = .2) +


  
  
    
 # for comparison of 23-group with 1,000 trials to the 10,000 trials    
 new_data_for_plot <- bind_rows(data_for_plot %>% filter(group_size == 23), 
                                   big_sample %>% 
                                     group_by(group_size) %>% 
                                     summarise(mean_success = mean(percent_success),
                                               sd_success = sd(percent_success),
                                               num_repeated_trials = n(),
                                               over_50 = sum(percent_success > 0.5),
                                               probability_chance_is_over_50 = over_50/num_repeated_trials))
    

 tibble(x = c(0.40, 0.60)) %>% 
  ggplot() +
  aes(x) +
  stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[1, 2]], sd = new_data_for_plot[[1, 3]]), col = "blue") +
  stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[2, 2]], sd = new_data_for_plot[[2, 3]]), col = "red")    



 # function for working out area under the curve
 pnorm()
	library(tidyverse)

	# Let's generate our different trials data
	# and save to rds

	test_if_sameness <- function(size_of_group) {
	tibble(person_id = 1:size_of_group) %>%
	mutate(birthday = sample(1:365, n(), replace = T)) %>%
	mutate(other_dates = list(rename(., person_id1 = person_id,
	birthday1 = birthday))) %>%
	unnest(other_dates) %>%
	filter(person_id != person_id1) %>%
	filter(birthday == birthday1) %>%
	nrow() > 0
	}



	bday_sample <- function(num_repeated_trials, sizes_of_groups, number_of_trials) {

	tibble(group_size = sizes_of_groups) %>%
	crossing(trial_id = 1:number_of_trials,
	num_repeated_trials = 1:num_repeated_trials) %>%
	mutate(outcome = furrr::future_map_lgl(group_size, ~test_if_sameness(.x))) %>%
	group_by(group_size, num_repeated_trials) %>%
	summarise(count_of_success = sum(outcome),
	percent_success = count_of_success/number_of_trials)

	}



	library(furrr)
	future::plan(multiprocess)

	set.seed(123)
	first <- bday_sample(1, 1:60, 1000)

	set.seed(123)
	second <- bday_sample(20, 22:24, 1000)

	set.seed(123)
	big_sample <- bday_sample(20, 23, 10000)



	write_rds(first, "first.rds")
	write_rds(second, "second.rds")
	write_rds(big_sample, "big_sample.rds")



	#############################################

	library(tidyverse)

	# let's start our analysis

	first <- read_rds("first.rds")
	second <- read_rds("second.rds")
	big_sample <- read_rds("big_sample.rds")



	first %>%
	ggplot() +
	aes(group_size, percent_success) +
	geom_line(stat = "identity")



	data_for_plot <- second %>%
	group_by(group_size) %>%
	summarise(mean_success = mean(percent_success),
	sd_success = sd(percent_success),
	num_repeated_trials = n(),
	over_50 = sum(percent_success > 0.5),
	probability_chance_is_over_50 = over_50/num_repeated_trials)



	second %>%
	ggplot() +
	aes(percent_success) +
	geom_histogram(aes(fill = percent_success > 0.5)) +
	facet_grid(. ~ group_size) +
	labs(fill = "Over 50%") +
	guides(fill = guide_legend(reverse=T)) +
	geom_vline(data = data_for_plot, aes(xintercept = mean_success), col = 'black', linetype = "dashed", alpha = 0.3, size = 1) +
	geom_label(data = data_for_plot, aes(mean_success, y = 3.5, label = paste0("mean = \n", round(mean_success, 3))))



	# Assuming normality, here are the distributions
	tibble(x = c(0.40, 0.60)) %>%
	ggplot() +
	aes(x) +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[1, 2]], sd = data_for_plot[[1, 3]]), col = "red") +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]), col = "blue") +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[3, 2]], sd = data_for_plot[[3, 3]]), col = "black")




	# for shading area under the curve.
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]),
	xlim = c(0.474, 0.545), geom = "area", fill = "blue", alpha = .2) +





	# for comparison of 23-group with 1,000 trials to the 10,000 trials
	new_data_for_plot <- bind_rows(data_for_plot %>% filter(group_size == 23),
	big_sample %>%
	group_by(group_size) %>%
	summarise(mean_success = mean(percent_success),
	sd_success = sd(percent_success),
	num_repeated_trials = n(),
	over_50 = sum(percent_success > 0.5),
	probability_chance_is_over_50 = over_50/num_repeated_trials))


	tibble(x = c(0.40, 0.60)) %>%
	ggplot() +
	aes(x) +
	stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[1, 2]], sd = new_data_for_plot[[1, 3]]), col = "blue") +
	stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[2, 2]], sd = new_data_for_plot[[2, 3]]), col = "red")



	# function for working out area under the curve
	pnorm()