Skip to content

Instantly share code, notes, and snippets.

@vankesteren
Created September 13, 2023 13:33
Show Gist options
  • Save vankesteren/29ee2cd3ca0cb038ab338558bc91af60 to your computer and use it in GitHub Desktop.
Save vankesteren/29ee2cd3ca0cb038ab338558bc91af60 to your computer and use it in GitHub Desktop.
Simple split & k-fold conformal prediction for linear regression model
# Conformal prediction for regression prediction intervals
# see example 2.3.1 from https://arxiv.org/pdf/2107.07511.pdf
# goal: make a 90% prediction interval for linear model
# First simulate some data with non-normal residuals
set.seed(45)
sim_data <- function(N) {
x <- runif(N, min = -4, max = 4)
y <- x + rgamma(N, 2, 1) - 2
data.frame(x = x, y = y)
}
df <- sim_data(1000)
# plot the data
plot(
x = df,
xlim = c(-4, 4),
ylim = c(-7, 9),
bty = "L",
col = "lightseagreen",
main = "Conformal prediction intervals"
)
mtext("For linear regression with non-normal residuals")
# theoretical prediction intervals of the true model
x_new <- seq(-4, 4, 0.01)
lines(x_new, x_new - 2 + qgamma(0.95, 2, 1))
lines(x_new, x_new - 2 + qgamma(0.05, 2, 1))
# test coverage should be 90%
coverage <- function(y, lower, upper) {
sum(y >= lower & y <= upper) / length(y)
}
df_test <- sim_data(1e7)
cov_true <- coverage(
y = df_test$y,
lower = df_test$x - 2 + qgamma(0.05, 2, 1),
upper = df_test$x - 2 + qgamma(0.95, 2, 1)
)
print(cov_true)
# fit linear model
fit <- lm(y ~ x, df)
# get prediction interval
pred <- predict(fit, newdata = data.frame(x = x_new), interval = "prediction", level = 0.9)
# plot
lines(x_new, pred[,2], lty = 2)
lines(x_new, pred[,3], lty = 2)
# test coverage
test_pred <- predict(fit, newdata = df_test, interval = "prediction", level = 0.9)
cov_lm <- coverage(
y = df_test$y,
lower = test_pred[,2],
upper = test_pred[,3]
)
print(cov_lm)
# Split conformal interval
id_train <- sample(1:nrow(df), 800)
df_train <- df[id_t,]
df_calib <- df[-id_t,]
fit_train <- lm(y ~ x, df_train)
scores <- df_calib$y - predict(fit_train, df_calib)
# plot the interval
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.95), lty = 3)
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.05), lty = 3)
# compute coverage
cov_conf <- coverage(
y = df_test$y,
upper = predict(fit, df_test) + quantile(scores, 0.95),
lower = predict(fit, df_test) + quantile(scores, 0.05)
)
print(cov_conf)
# K-fold cross-conformal interval
K <- 1000
fold_id <- sample(rep(1:K, each = ceiling(1000/K)))[1:1000]
kscores <- numeric(1000)
for (k in 1:K) {
id_k <- fold_id == k
fit_k <- lm(y ~ x, df[!id_k,])
kscores[id_k] <- df[id_k,]$y - predict(fit_k, df[id_k, ])
}
cov_Kconf <- coverage(
y = df_test$y,
upper = predict(fit, df_test) + quantile(kscores, 0.95),
lower = predict(fit, df_test) + quantile(kscores, 0.05)
)
print(cov_Kconf)
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.95), lty = 4)
lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.05), lty = 4)
legend("bottomright", c("true", "lm", "conf", "Kconf"), lty = 1:4)
# summary table for coverage
data.frame(
method = c("true", "lm", "conf", "Kconf"),
coverage = c(cov_true, cov_lm, cov_conf, cov_Kconf)
)
# conformal prediction for quantile regression
# example 2.2
library(quantreg)
library(dplyr)
# first simulate some data
set.seed(45)
sim_data <- function(N = 1000) {
x <- runif(N, min = -4, max = 4)
y <- sin(x*1.5) + rnorm(N, sd = 0.3)
data.frame(x = x, y = y)
}
df <- sim_data(1000)
plot(
x = df,
xlim = c(-4, 4),
ylim = c(-2, 2),
bty = "n",
col = "lightgreen",
main = "Conformal 90% quantile regression"
)
mtext("With wrong quantile model")
# plot theoretical 5th and 95th quantiles
x_new <- seq(-4, 4, 0.01)
lines(x_new, sin(x_new * 1.5) + qnorm(0.95, sd = 0.3), lty = 1)
lines(x_new, sin(x_new * 1.5) + qnorm(0.05, sd = 0.3), lty = 1)
legend("topright", c("true (0.9)", "quant (0.6)", "conf (0.6)"), lty = c(1, 2, 3))
# then estimate models for the 5th and 95th quantile
# NB: uses incorrect model!
model_lower <- rq(y ~ poly(x, 10), tau = 0.2, data = df)
model_upper <- rq(y ~ poly(x, 10), tau = 0.8, data = df)
# plot the estimated model
lines(x_new, predict(model_lower, data.frame(x = x_new)), lty = 2)
lines(x_new, predict(model_upper, data.frame(x = x_new)), lty = 2)
# Conformalize the interval using calibration data
df_c <- sim_data(300)
points(df_c, col = "lightblue", pch = 3, cex = 0.7)
s <- function(x, y) {
t_hat_lower <- predict(model_lower, data.frame(x = x))
t_hat_upper <- predict(model_upper, data.frame(x = x))
return(max(t_hat_lower - y, y - t_hat_upper))
}
scores <- vapply(
X = 1:nrow(df_c),
FUN = \(i) s(df_c[i, 1], df_c[i, 2]),
FUN.VALUE = 0.
)
qhat <- quantile(scores, 0.9)
# plot conformalized intervals
lines(x_new, predict(model_lower, data.frame(x = x_new)) - qhat, lty = 3)
lines(x_new, predict(model_upper, data.frame(x = x_new)) + qhat, lty = 3)
@vankesteren
Copy link
Author

image

@vankesteren
Copy link
Author

image

@vankesteren
Copy link
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment