Last active
July 20, 2017 15:00
-
-
Save ismayc/74eee31d4b9f63f9b62e9d7e411b5628 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(okcupiddata) | |
library(stringr) | |
# devtools::install_github("andrewpbray/infer") | |
library(infer) | |
library(dplyr) | |
library(ggplot2) | |
set.seed(2017) | |
prof_small <- profiles %>% | |
sample_n(size = 500) %>% | |
mutate(frisco = case_when( | |
str_detect(location, "san fran") ~ "san fran", | |
!str_detect(location, "san fran") ~ "not san fran" | |
)) %>% | |
select(age, sex, frisco, drugs, height, status) | |
## H_0: \mu_f - \mu_m = 0 | |
## H_A: \mu_f - \mu_m < 0 | |
# Calculate observed test statistic | |
obs_stat <- prof_small %>% | |
group_by(sex) %>% | |
summarize(mean_age = mean(age)) %>% | |
summarize(diff_mean = diff(mean_age)) %>% | |
pull(diff_mean) | |
# Not sold on using `pull()` here but without it, you'll | |
# need to use $ to get to obs_stat$diff_mean. | |
# To visualize currently without geom_vline() in visualize() | |
prof_small %>% | |
specify(age ~ sex) %>% # alt: response = age, explanatory = sex | |
hypothesize(null = "independence") %>% | |
generate(reps = 1000, type = "permute") %>% | |
calculate(stat = "diff in means") %>% | |
visualize() + | |
geom_vline(xintercept = obs_stat, color = "blue") | |
# Eventually this could look something like | |
prof_small %>% | |
specify(age ~ sex) %>% # alt: response = age, explanatory = sex | |
hypothesize(null = "independence") %>% | |
generate(reps = 1000, type = "permute") %>% | |
calculate(stat = "diff in means") %>% | |
# Totally spit-balling here | |
visualize(tail = "left", extreme_as = obs_stat) | |
# Or maybe it is better to save the null distribution | |
# And then plot and calculate the p-value? | |
null_distn <- prof_small %>% | |
specify(age ~ sex) %>% # alt: response = age, explanatory = sex | |
hypothesize(null = "independence") %>% | |
generate(reps = 1000, type = "permute") %>% | |
calculate(stat = "diff in means") | |
ggplot(data = null_distn, mapping = aes(x = stat)) + | |
geom_histogram(bins = 30, color = "white") + | |
geom_vline(xintercept = obs_stat, color = "blue") | |
# Calculate p-value | |
null_distn %>% | |
summarize(p_value = mean(stat <= obs_stat)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment