Created
September 13, 2023 13:33
-
-
Save vankesteren/29ee2cd3ca0cb038ab338558bc91af60 to your computer and use it in GitHub Desktop.
Simple split & k-fold conformal prediction for linear regression model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Conformal prediction for regression prediction intervals | |
# see example 2.3.1 from https://arxiv.org/pdf/2107.07511.pdf | |
# goal: make a 90% prediction interval for linear model | |
# First simulate some data with non-normal residuals | |
set.seed(45) | |
sim_data <- function(N) { | |
x <- runif(N, min = -4, max = 4) | |
y <- x + rgamma(N, 2, 1) - 2 | |
data.frame(x = x, y = y) | |
} | |
df <- sim_data(1000) | |
# plot the data | |
plot( | |
x = df, | |
xlim = c(-4, 4), | |
ylim = c(-7, 9), | |
bty = "L", | |
col = "lightseagreen", | |
main = "Conformal prediction intervals" | |
) | |
mtext("For linear regression with non-normal residuals") | |
# theoretical prediction intervals of the true model | |
x_new <- seq(-4, 4, 0.01) | |
lines(x_new, x_new - 2 + qgamma(0.95, 2, 1)) | |
lines(x_new, x_new - 2 + qgamma(0.05, 2, 1)) | |
# test coverage should be 90% | |
coverage <- function(y, lower, upper) { | |
sum(y >= lower & y <= upper) / length(y) | |
} | |
df_test <- sim_data(1e7) | |
cov_true <- coverage( | |
y = df_test$y, | |
lower = df_test$x - 2 + qgamma(0.05, 2, 1), | |
upper = df_test$x - 2 + qgamma(0.95, 2, 1) | |
) | |
print(cov_true) | |
# fit linear model | |
fit <- lm(y ~ x, df) | |
# get prediction interval | |
pred <- predict(fit, newdata = data.frame(x = x_new), interval = "prediction", level = 0.9) | |
# plot | |
lines(x_new, pred[,2], lty = 2) | |
lines(x_new, pred[,3], lty = 2) | |
# test coverage | |
test_pred <- predict(fit, newdata = df_test, interval = "prediction", level = 0.9) | |
cov_lm <- coverage( | |
y = df_test$y, | |
lower = test_pred[,2], | |
upper = test_pred[,3] | |
) | |
print(cov_lm) | |
# Split conformal interval | |
id_train <- sample(1:nrow(df), 800) | |
df_train <- df[id_t,] | |
df_calib <- df[-id_t,] | |
fit_train <- lm(y ~ x, df_train) | |
scores <- df_calib$y - predict(fit_train, df_calib) | |
# plot the interval | |
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.95), lty = 3) | |
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.05), lty = 3) | |
# compute coverage | |
cov_conf <- coverage( | |
y = df_test$y, | |
upper = predict(fit, df_test) + quantile(scores, 0.95), | |
lower = predict(fit, df_test) + quantile(scores, 0.05) | |
) | |
print(cov_conf) | |
# K-fold cross-conformal interval | |
K <- 1000 | |
fold_id <- sample(rep(1:K, each = ceiling(1000/K)))[1:1000] | |
kscores <- numeric(1000) | |
for (k in 1:K) { | |
id_k <- fold_id == k | |
fit_k <- lm(y ~ x, df[!id_k,]) | |
kscores[id_k] <- df[id_k,]$y - predict(fit_k, df[id_k, ]) | |
} | |
cov_Kconf <- coverage( | |
y = df_test$y, | |
upper = predict(fit, df_test) + quantile(kscores, 0.95), | |
lower = predict(fit, df_test) + quantile(kscores, 0.05) | |
) | |
print(cov_Kconf) | |
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.95), lty = 4) | |
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.05), lty = 4) | |
legend("bottomright", c("true", "lm", "conf", "Kconf"), lty = 1:4) | |
# summary table for coverage | |
data.frame( | |
method = c("true", "lm", "conf", "Kconf"), | |
coverage = c(cov_true, cov_lm, cov_conf, cov_Kconf) | |
) | |
# conformal prediction for quantile regression | |
# example 2.2 | |
library(quantreg) | |
library(dplyr) | |
# first simulate some data | |
set.seed(45) | |
sim_data <- function(N = 1000) { | |
x <- runif(N, min = -4, max = 4) | |
y <- sin(x*1.5) + rnorm(N, sd = 0.3) | |
data.frame(x = x, y = y) | |
} | |
df <- sim_data(1000) | |
plot( | |
x = df, | |
xlim = c(-4, 4), | |
ylim = c(-2, 2), | |
bty = "n", | |
col = "lightgreen", | |
main = "Conformal 90% quantile regression" | |
) | |
mtext("With wrong quantile model") | |
# plot theoretical 5th and 95th quantiles | |
x_new <- seq(-4, 4, 0.01) | |
lines(x_new, sin(x_new * 1.5) + qnorm(0.95, sd = 0.3), lty = 1) | |
lines(x_new, sin(x_new * 1.5) + qnorm(0.05, sd = 0.3), lty = 1) | |
legend("topright", c("true (0.9)", "quant (0.6)", "conf (0.6)"), lty = c(1, 2, 3)) | |
# then estimate models for the 5th and 95th quantile | |
# NB: uses incorrect model! | |
model_lower <- rq(y ~ poly(x, 10), tau = 0.2, data = df) | |
model_upper <- rq(y ~ poly(x, 10), tau = 0.8, data = df) | |
# plot the estimated model | |
lines(x_new, predict(model_lower, data.frame(x = x_new)), lty = 2) | |
lines(x_new, predict(model_upper, data.frame(x = x_new)), lty = 2) | |
# Conformalize the interval using calibration data | |
df_c <- sim_data(300) | |
points(df_c, col = "lightblue", pch = 3, cex = 0.7) | |
s <- function(x, y) { | |
t_hat_lower <- predict(model_lower, data.frame(x = x)) | |
t_hat_upper <- predict(model_upper, data.frame(x = x)) | |
return(max(t_hat_lower - y, y - t_hat_upper)) | |
} | |
scores <- vapply( | |
X = 1:nrow(df_c), | |
FUN = \(i) s(df_c[i, 1], df_c[i, 2]), | |
FUN.VALUE = 0. | |
) | |
qhat <- quantile(scores, 0.9) | |
# plot conformalized intervals | |
lines(x_new, predict(model_lower, data.frame(x = x_new)) - qhat, lty = 3) | |
lines(x_new, predict(model_upper, data.frame(x = x_new)) + qhat, lty = 3) | |
Author
vankesteren
commented
Sep 13, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment