library(tidyverse)
library(infer)
## Generate some fake data that is similar to your actual data
set.seed(1)
resids <- tibble(
nationality = c(rep('English', 60), rep('non-English', 100), rep('African', 40)),
.resid = c(
rnorm(60, sd = 2) + runif(60, 0, 0.5),
rnorm(100, mean = 0.05, sd = 2) + runif(100, -3, 3),
rnorm(40, mean = -0.1, sd = 1) + runif(40, -2, 1)
)
)
resids |>
ggplot(aes(x = nationality, y = .resid)) +
geom_boxplot()two_groups <- c("English", "non-English")
two_group_resids <- resids |>
filter(nationality %in% two_groups) |>
mutate(nationality = factor(nationality))
## This is the more traditional calculation of p-value, with the infer::t_test wrapper for stats::t.test.
p_value_traditional <- two_group_resids |>
t_test(
formula = .resid ~ nationality,
order = two_groups,
## "two-sided" means we're testing for a statistically significant difference in general.
## "greater" or "less" means we're interested specifically in one group having
## a statistically significantly higher/lower average.
alternative = "two-sided"
)
## A p-value greater than 0.05 is telling us that there isn't a statistically significant difference
## between the English and non-English residuals, meaning that we cannot
## reasonably conclude that Fifa shooting ratings are biased favorably for English players.
p_value_traditional
#> # A tibble: 1 × 7
#> statistic t_df p_value alternative estimate lower_ci upper_ci
#> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1.20 154. 0.233 two.sided 0.395 -0.257 1.05
## We can use a more generalizable approach to calculate the p-value, where we simulate data from the distribution
## of the null hypothesis and essentially calculate the p-value as a percentile in
## the simulated distribution. We should come to the same conclusion as we find
## with the traditional approach, although our p-value will not be exactly the same.
observed_statistic <- two_group_resids |>
specify(response = .resid, explanatory = nationality) |>
calculate(stat = "diff in medians", order = two_groups)
observed_statistic
#> Response: .resid (numeric)
#> Explanatory: nationality (factor)
#> # A tibble: 1 × 1
#> stat
#> <dbl>
#> 1 0.473
null_dist_simulated <- two_group_resids |>
specify(response = .resid, explanatory = nationality) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = two_groups)
## The red bar corresponds to the value of observed_statistic, the "t-stat" value
null_dist_simulated %>%
visualize() +
shade_p_value(observed_statistic, direction = "two-sided")## If observed_statistic were outside 2.5-97.5th percentile range, then the p-value would be less than 0.05.
c(quantile(null_dist_simulated$stat, 0.025), quantile(null_dist_simulated$stat, 0.975))
#> 2.5% 97.5%
#> -0.6629786 0.6702365
p_value_simulated <- null_dist_simulated %>%
get_p_value(
obs_stat = observed_statistic,
direction = "two-sided"
)
p_value_simulated
#> # A tibble: 1 × 1
#> p_value
#> <dbl>
#> 1 0.17Created on 2022-08-15 by the reprex package (v2.0.1)

