library(tidyverse)
# load in source_rmd()
devtools::source_gist("https://gist.github.com/noamross/a549ee50e8a4fd68b8b1")
#> Sourcing https://gist.githubusercontent.com/noamross/a549ee50e8a4fd68b8b1/raw/40960d8280438b7e2e8d6502f3c1d4ad348caeb6/source_rmd.R
#> SHA-1 hash of file is 624ae941e51dee522994e014928448170cedf1a3
# Also could have used solution here:
# https://stackoverflow.com/questions/10966109/how-to-source-r-markdown-file-like-sourcemyfile-r
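# That solution amounts to extracting the code chunks with knitr::purl() and
# sourcing the result; a rough sketch (the name `source_rmd_alt` is just
# illustrative, not part of the gist):
source_rmd_alt <- function(file, ...) {
  tmp <- tempfile(fileext = ".R")
  knitr::purl(file, output = tmp)
  source(tmp, ...)
}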
# Source code from blog post
source_rmd("https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd", skip_plots = TRUE)
#> processing file: https://raw.githubusercontent.com/brshallo/brshallo/master/content/post/2020-11-23-remember-resampling-techniques-change-the-base-rates-of-your-predictions.Rmd
#> output file: C:\Users\BSHALLOW\AppData\Local\Temp\RtmpwPXH3e\file48706e763d0e.R
# Adjust predicted probabilities for a shift in base rates (Bayes' rule):
# `condprobs` are probabilities predicted under the sampled (wrong) base rate
# `wrongprob`, and `trueprob` is the true base rate
classadjust <- function(condprobs, wrongprob, trueprob) {
  a <- condprobs / (wrongprob / trueprob)
  comp_cond <- 1 - condprobs
  comp_wrong <- 1 - wrongprob
  comp_true <- 1 - trueprob
  b <- comp_cond / (comp_wrong / comp_true)
  return(a / (a + b))
}
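# Quick spot check of classadjust() with made-up numbers: a prediction of 0.8
# under a downsampled 50/50 base rate gets pulled down sharply when the true
# base rate is 5%
classadjust(0.8, wrongprob = 0.5, trueprob = 0.05)
#> [1] 0.173913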
# Intercept offset to shift log-odds predictions made at the sampled base
# rate back to the true base rate
offset_intercept <- function(true_baserate, sample_baserate){
  log((true_baserate / (1 - true_baserate)) * ((1 - sample_baserate) / sample_baserate))
}
# inverse logit: convert log-odds to a probability
lodds_to_prob <- function(x) exp(x) / (exp(x) + 1)
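# Spot check of the offset logic with illustrative values: a model trained on
# 50/50 downsampled data predicting log-odds of 0 (i.e. probability 0.5)
# should map back to the true base rate once the offset is applied
lodds_to_prob(0 + offset_intercept(true_baserate = 0.05, sample_baserate = 0.5))
#> [1] 0.05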
base_rate <- summarise(train, prob = sum(target) / n()) %>% pull(prob)
offset <- offset_intercept(base_rate, 0.5)
# Different approaches to rescaling predictions (platt scaling, offset, adjust)
test_preds_scaling_approaches <- test %>%
  # preds when no resampling
  modelr::spread_predictions(mod_5_95, mod_50_50) %>%
  # the calibration model expects a column named `pred`
  mutate(pred = mod_50_50) %>%
  # preds when resampling then platt scaling
  modelr::spread_predictions(mod_50_50_rescaled_calibrated) %>%
  select(-pred) %>%
  # preds when using intercept offset
  mutate(mod50_offset = mod_50_50 + offset) %>%
  # convert log-odds to probabilities (`convert_lodds()` is defined in the sourced post)
  mutate(across(contains("mod"), list(pred = convert_lodds))) %>%
  # preds when adjusting
  mutate(mod50_adjust_pred = classadjust(mod_50_50_pred, 0.50, base_rate)) %>%
  rename(
    formula_adjusted = mod50_adjust_pred,
    offset = mod50_offset_pred,
    platt_scaled = mod_50_50_rescaled_calibrated_pred,
    unaltered = mod_5_95_pred
  ) %>%
  # `feature` was simulated as the log-odds of the event being TRUE, so
  # converting it to a probability gives the actual event probability
  mutate(actual = lodds_to_prob(feature))
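# Rough sanity check (column names as created above): the mean predicted
# probability under each rescaled approach should land near `base_rate`
test_preds_scaling_approaches %>%
  summarise(across(c(unaltered, platt_scaled, offset, formula_adjusted), mean))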
# All plotted and compared against preds when not doing any adjustment
test_preds_scaling_approaches %>%
  ggplot(aes(x = feature)) +
  geom_line(aes(y = formula_adjusted, colour = "adjusted (after downsample)")) +
  geom_line(aes(y = offset, colour = "offset (after downsample)"), linetype = "dashed") +
  geom_line(aes(y = platt_scaled, colour = "platt scaled (after downsample)")) +
  geom_line(aes(y = unaltered, colour = "unaltered (no downsampling)"), linetype = "dashed") +
  geom_line(aes(y = actual, colour = "actual probability")) +
  labs(y = "predicted probability",
       colour = NULL,
       caption =
         "In this univariate example 'platt scaled' produces the same results as doing no resampling.
         \nIt is closer to the actual probability at the tails than the offset/adjusted approaches (in this case).") +
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(ncol = 2))
Below is an approach I've used for visualizing relationships between likelihoods and categories. It suggests this approach may not be perfect at the tails of the distribution...
actuals_data <- test_preds_scaling_approaches %>%
  mutate(ntile = ntile(feature, 50)) %>%
  group_by(ntile) %>%
  mutate(feature = median(feature)) %>%
  group_by(feature) %>%
  summarise(actual_observed = sum(target) / n())
#> `summarise()` ungrouping output (override with `.groups` argument)
actuals_data %>%
  ggplot(aes(x = feature, y = actual_observed)) +
  geom_line() +
  geom_smooth() +
  geom_line(aes(y = actual, colour = "actual"), data = test_preds_scaling_approaches)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
I have used this or similar binning approaches in other places, e.g. https://stats.stackexchange.com/a/391125/193123
Created on 2020-12-04 by the reprex package (v0.3.0)
Perhaps I should also add a reprex on isotonic regression...