This is my attempt to replicate the analysis by Laurie Shaw here.
Creating some fake data.
library(tibble)
library(purrr)
library(tidyr)
library(MASS)
library(ggplot2)
library(forcats)
n_players <- 100
max_goals <- 30
set.seed(1)
df <- tibble(
player_id = 1:n_players,
g = sample(
0:max_goals,
size = 100,
prob = c(1, max_goals:1) / sum(c(1, max_goals:1)),
replace = TRUE
)
) |>
mutate(
shots = round((4 + rnorm(n(), mean = 0, sd = 1)) * g),
xg = pmax(g + rnorm(n(), mean = 0, sd = shots / 30), 0),
## intentionally deflate performance of lower shot takers, to emulate lower shot conversion rates of defenders, DMs, etc.
xg = ifelse(shots < 50, 1.1 * xg, xg),
raw_g_xg_ratio = g / xg
)
df
#> # A tibble: 100 × 5
#> player_id g shots xg raw_g_xg_ratio
#> <int> <int> <dbl> <dbl> <dbl>
#> 1 1 5 22 5.86 0.853
#> 2 2 7 24 7.68 0.911
#> 3 3 11 48 11.5 0.953
#> 4 4 22 63 20.0 1.10
#> 5 5 4 22 3.20 1.25
#> 6 6 21 126 16.5 1.27
#> 7 7 24 87 26.9 0.892
#> 8 8 13 38 13.4 0.968
#> 9 9 12 55 9.46 1.27
#> 10 10 1 4 1.37 0.728
#> # … with 90 more rows
Define a minimum shot number to use to estimate the prior distribution. Laurie used 50.
shot_threshold <- 50
prior_df <- filter(df, shots >= shot_threshold)
## Ignore the warning here.
prior_distr <- fitdistr(
prior_df$raw_g_xg_ratio,
dgamma,
start = list(shape = 1, rate = 1)
)
prior_distr
#> shape rate
#> 39.882576 38.473156
#> ( 9.234005) ( 8.963809)
prior_shape <- prior_distr$estimate[1]
prior_rate <- prior_distr$estimate[2]
This is the part that I'm not sure is right, but the results look reasonable 🤷
simulate_posterior <- function(successes, trials, n_sims = 10000) {
posterior_shape <- prior_shape + successes
posterior_rate <- prior_rate + trials
posterior_sample <- rgamma(n = n_sims, shape = posterior_shape, rate = posterior_rate)
list(
mean = mean(posterior_sample),
sd = sd(posterior_sample)
)
}
df$adj_g_xg_ratio <- map2(
df$g, df$xg,
simulate_posterior
)
res <- unnest_wider(df, adj_g_xg_ratio, names_sep = '_')
res
#> # A tibble: 100 × 7
#> player_id g shots xg raw_g_xg_ratio adj_g_xg_ratio_mean adj_g_xg_rat…¹
#> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 5 22 5.86 0.853 1.01 0.152
#> 2 2 7 24 7.68 0.911 1.02 0.148
#> 3 3 11 48 11.5 0.953 1.02 0.144
#> 4 4 22 63 20.0 1.10 1.06 0.134
#> 5 5 4 22 3.20 1.25 1.05 0.157
#> 6 6 21 126 16.5 1.27 1.11 0.141
#> 7 7 24 87 26.9 0.892 0.975 0.122
#> 8 8 13 38 13.4 0.968 1.02 0.140
#> 9 9 12 55 9.46 1.27 1.09 0.149
#> 10 10 1 4 1.37 0.728 1.03 0.159
#> # … with 90 more rows, and abbreviated variable name ¹adj_g_xg_ratio_sd
Visual of shrinkage
res |>
ggplot() +
geom_point(
aes(
x = raw_g_xg_ratio,
y = adj_g_xg_ratio_mean,
size = shots
)
) +
geom_vline(aes(xintercept = 1)) +
geom_hline(aes(yintercept = 1)) +
geom_abline(aes(slope = 1, intercept = 0)) +
coord_cartesian(
xlim = c(0.7, 1.3),
y = c(0.7, 1.3)
) +
labs(title = 'Shrinkage go brrr')
Plot akin to Laurie's plot
res |>
slice_max(adj_g_xg_ratio_mean, n = 20) |>
mutate(player_id = fct_reorder(as.character(player_id), adj_g_xg_ratio_mean)) |>
ggplot() +
aes(y = player_id) +
geom_point(aes(x = adj_g_xg_ratio_mean)) +
geom_errorbarh(
aes(
xmin = adj_g_xg_ratio_mean - adj_g_xg_ratio_sd,
xmax = adj_g_xg_ratio_mean + adj_g_xg_ratio_sd
)
) +
geom_vline(aes(xintercept = 1)) +
labs(title = 'Top overperformers')