Skip to content

Instantly share code, notes, and snippets.

@tonyelhabr
Last active August 15, 2022 13:14
Show Gist options
  • Save tonyelhabr/f6d8007dc288d63e0e6c8b6dd6d71d6e to your computer and use it in GitHub Desktop.
Save tonyelhabr/f6d8007dc288d63e0e6c8b6dd6d71d6e to your computer and use it in GitHub Desktop.
Example nationality resids difference in means calculation
library(tidyverse)
library(infer)

## Generate some fake data that is similar to your actual data
set.seed(1)
resids <- tibble(
  nationality = c(rep('English', 60), rep('non-English', 100), rep('African', 40)),
  .resid = c(
    rnorm(60, sd = 2) + runif(60, 0, 0.5),
    rnorm(100, mean = 0.05, sd = 2) + runif(100, -3, 3),
    rnorm(40, mean = -0.1, sd = 1) + runif(40, -2, 1)
  )
)

resids |>
  ggplot(aes(x = nationality, y = .resid)) +
  geom_boxplot()

two_groups <- c("English", "non-English")
two_group_resids <- resids |>
  filter(nationality %in% two_groups) |>
  mutate(nationality = factor(nationality))

## This is the more traditional calculation of p-value, with the infer::t_test wrapper for stats::t.test.
p_value_traditional <- two_group_resids |>
  t_test(
    formula = .resid ~ nationality,
    order = two_groups,
    ## "two-sided" means we're testing for a statistically significant difference in general.
    ##   "greater" or "less" means we're interested specifically in one group having
    ##   a statistically significantly higher/lower average.
    alternative = "two-sided"
  )
## A p-value greater than 0.05 is telling us that there isn't a statistically significant difference
##   between the English and non-English residuals, meaning that we cannot
##   reasonably conclude that Fifa shooting ratings are biased favorably for English players.
p_value_traditional
#> # A tibble: 1 × 7
#>   statistic  t_df p_value alternative estimate lower_ci upper_ci
#>       <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>    <dbl>
#> 1      1.20  154.   0.233 two.sided      0.395   -0.257     1.05

## We can use a more generalizable approach to calculate the p-value, where we simulate data from the distribution
##   of the null hypothesis and essentially calculate the p-value as a percentile in
##   the simulated distribution. We should come to the same conclusion as we find
##   with the traditional approach, although our p-value will not be exactly the same.
observed_statistic <- two_group_resids |>
  specify(response = .resid, explanatory = nationality) |>
  calculate(stat = "diff in medians", order = two_groups)
observed_statistic
#> Response: .resid (numeric)
#> Explanatory: nationality (factor)
#> # A tibble: 1 × 1
#>    stat
#>   <dbl>
#> 1 0.473

null_dist_simulated <- two_group_resids |>
  specify(response = .resid, explanatory = nationality) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = two_groups)

## The red bar corresponds to the value of observed_statistic, the "t-stat" value 
null_dist_simulated %>%
  visualize() +
  shade_p_value(observed_statistic, direction = "two-sided")

## If observed_statistic were outside 2.5-97.5th percentile range, then the p-value would be less than 0.05.
c(quantile(null_dist_simulated$stat, 0.025), quantile(null_dist_simulated$stat, 0.975))
#>       2.5%      97.5% 
#> -0.6629786  0.6702365

p_value_simulated <- null_dist_simulated %>%
  get_p_value(
    obs_stat = observed_statistic,
    direction = "two-sided"
  )
p_value_simulated
#> # A tibble: 1 × 1
#>   p_value
#>     <dbl>
#> 1    0.17

Created on 2022-08-15 by the reprex package (v2.0.1)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment