Emperical bayes estimation of xG overperformance

This is my attempt to replicate the analysis by Laurie Shaw here.

Creating some fake data.

library(tibble)
library(purrr)
library(tidyr)
library(MASS)
library(ggplot2)
library(forcats)

n_players <- 100
max_goals <- 30

set.seed(1)
df <- tibble(
  player_id = 1:n_players,
  g = sample(
    0:max_goals,
    size = 100,
    prob = c(1, max_goals:1) / sum(c(1, max_goals:1)),
    replace = TRUE
  )
) |>
  mutate(
    shots = round((4 + rnorm(n(), mean = 0, sd = 1)) * g),
    xg = pmax(g + rnorm(n(), mean = 0, sd = shots / 30), 0),
    ## intentionally deflate performance of lower shot takers, to emulate lower shot conversion rates of defenders, DMs, etc.
    xg = ifelse(shots < 50, 1.1 * xg, xg),
    raw_g_xg_ratio = g / xg
  )
df
#> # A tibble: 100 × 5
#>    player_id     g shots    xg raw_g_xg_ratio
#>        <int> <int> <dbl> <dbl>          <dbl>
#>  1         1     5    22  5.86          0.853
#>  2         2     7    24  7.68          0.911
#>  3         3    11    48 11.5           0.953
#>  4         4    22    63 20.0           1.10 
#>  5         5     4    22  3.20          1.25 
#>  6         6    21   126 16.5           1.27 
#>  7         7    24    87 26.9           0.892
#>  8         8    13    38 13.4           0.968
#>  9         9    12    55  9.46          1.27 
#> 10        10     1     4  1.37          0.728
#> # … with 90 more rows

Define a minimum shot number to use to estimate the prior distribution. Laurie used 50.

shot_threshold <- 50
prior_df <- filter(df, shots >= shot_threshold)

## Ignore the warning here.
prior_distr <- fitdistr(
  prior_df$raw_g_xg_ratio,
  dgamma,
  start = list(shape = 1, rate = 1)
)
prior_distr
#>      shape       rate   
#>   39.882576   38.473156 
#>  ( 9.234005) ( 8.963809)
prior_shape <- prior_distr$estimate[1]
prior_rate <- prior_distr$estimate[2]

This is the part that I'm not sure is right, but the results look reasonable 🤷

simulate_posterior <- function(successes, trials, n_sims = 10000) {
  posterior_shape <- prior_shape + successes
  posterior_rate <- prior_rate + trials
  posterior_sample <- rgamma(n = n_sims, shape = posterior_shape, rate = posterior_rate)
  list(
    mean = mean(posterior_sample),
    sd = sd(posterior_sample)
  )
}

df$adj_g_xg_ratio <- map2(
  df$g, df$xg,
  simulate_posterior
)

res <- unnest_wider(df, adj_g_xg_ratio, names_sep = '_')
res
#> # A tibble: 100 × 7
#>    player_id     g shots    xg raw_g_xg_ratio adj_g_xg_ratio_mean adj_g_xg_rat…¹
#>        <int> <int> <dbl> <dbl>          <dbl>               <dbl>          <dbl>
#>  1         1     5    22  5.86          0.853               1.01           0.152
#>  2         2     7    24  7.68          0.911               1.02           0.148
#>  3         3    11    48 11.5           0.953               1.02           0.144
#>  4         4    22    63 20.0           1.10                1.06           0.134
#>  5         5     4    22  3.20          1.25                1.05           0.157
#>  6         6    21   126 16.5           1.27                1.11           0.141
#>  7         7    24    87 26.9           0.892               0.975          0.122
#>  8         8    13    38 13.4           0.968               1.02           0.140
#>  9         9    12    55  9.46          1.27                1.09           0.149
#> 10        10     1     4  1.37          0.728               1.03           0.159
#> # … with 90 more rows, and abbreviated variable name ¹adj_g_xg_ratio_sd

Visual of shrinkage

res |>
  ggplot() +
  geom_point(
    aes(
      x = raw_g_xg_ratio,
      y = adj_g_xg_ratio_mean,
      size = shots
    )
  ) +
  geom_vline(aes(xintercept = 1)) +
  geom_hline(aes(yintercept = 1)) +
  geom_abline(aes(slope = 1, intercept = 0)) +
  coord_cartesian(
    xlim = c(0.7, 1.3),
    y = c(0.7, 1.3)
  ) +
  labs(title = 'Shrinkage go brrr')

Plot akin to Laurie's plot

res |>
  slice_max(adj_g_xg_ratio_mean, n = 20) |>
  mutate(player_id = fct_reorder(as.character(player_id), adj_g_xg_ratio_mean)) |>
  ggplot() +
  aes(y = player_id) +
  geom_point(aes(x = adj_g_xg_ratio_mean)) +
  geom_errorbarh(
    aes(
      xmin = adj_g_xg_ratio_mean - adj_g_xg_ratio_sd,
      xmax = adj_g_xg_ratio_mean + adj_g_xg_ratio_sd
    )
  ) +
  geom_vline(aes(xintercept = 1)) +
  labs(title = 'Top overperformers')

tonyelhabr/overperformance.md