vankesteren · September 13, 2023 13:33 · vankesteren · Sep 13, 2023 · vankesteren · Sep 13, 2023
diff --git a/split_kfold_conformal_lm.R b/split_kfold_conformal_lm.R
 # Conformal prediction for regression prediction intervals
 # see example 2.3.1 from https://arxiv.org/pdf/2107.07511.pdf
 # goal: make a 90% prediction interval for linear model

 # First simulate some data with non-normal residuals
 set.seed(45)
 sim_data <- function(N) {
  x <- runif(N, min = -4, max = 4)
  y <- x + rgamma(N, 2, 1) - 2
  data.frame(x = x, y = y)
 }
 df <- sim_data(1000)

 # plot the data
 plot(
  x    = df,
  xlim = c(-4, 4),
  ylim = c(-7, 9),
  bty  = "L",
  col  = "lightseagreen",
  main = "Conformal prediction intervals"
 )
 mtext("For linear regression with non-normal residuals")

 # theoretical prediction intervals of the true model
 x_new <- seq(-4, 4, 0.01)
 lines(x_new, x_new - 2 + qgamma(0.95, 2, 1))
 lines(x_new, x_new - 2 + qgamma(0.05, 2, 1))

 # test coverage should be 90%
 coverage <- function(y, lower, upper) {
  sum(y >= lower & y <= upper) / length(y)
 }
 df_test <- sim_data(1e7)
 cov_true <- coverage(
  y = df_test$y,
  lower = df_test$x - 2 + qgamma(0.05, 2, 1), 
  upper = df_test$x - 2 + qgamma(0.95, 2, 1)
 )
 print(cov_true)

 # fit linear model
 fit <- lm(y ~ x, df)

 # get prediction interval
 pred <- predict(fit, newdata = data.frame(x = x_new), interval = "prediction", level = 0.9)

 # plot
 lines(x_new, pred[,2], lty = 2)
 lines(x_new, pred[,3], lty = 2)

 # test coverage
 test_pred <- predict(fit, newdata = df_test, interval = "prediction", level = 0.9)
 cov_lm <- coverage(
  y = df_test$y,
  lower = test_pred[,2],
  upper = test_pred[,3]
 )
 print(cov_lm)

 # Split conformal interval
 id_train <- sample(1:nrow(df), 800)
 df_train <- df[id_t,]
 df_calib <- df[-id_t,]
 fit_train <- lm(y ~ x, df_train)
 scores <- df_calib$y - predict(fit_train, df_calib)

 # plot the interval
 lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.95), lty = 3)
 lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.05), lty = 3)

 # compute coverage
 cov_conf <- coverage(
  y = df_test$y,
  upper = predict(fit, df_test) + quantile(scores, 0.95),
  lower = predict(fit, df_test) + quantile(scores, 0.05)
 )
 print(cov_conf)

 # K-fold cross-conformal interval
 K <- 1000
 fold_id <- sample(rep(1:K, each = ceiling(1000/K)))[1:1000]
 kscores <- numeric(1000)
 for (k in 1:K) {
  id_k  <- fold_id == k
  fit_k <- lm(y ~ x, df[!id_k,])
  kscores[id_k] <- df[id_k,]$y - predict(fit_k, df[id_k, ])
 }

 cov_Kconf <- coverage(
  y = df_test$y,
  upper = predict(fit, df_test) + quantile(kscores, 0.95),
  lower = predict(fit, df_test) + quantile(kscores, 0.05)
 )
 print(cov_Kconf)


 lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.95), lty = 4)
 lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.05), lty = 4)


 legend("bottomright", c("true", "lm", "conf", "Kconf"), lty = 1:4)

 # summary table for coverage
 data.frame(
  method = c("true", "lm", "conf", "Kconf"), 
  coverage = c(cov_true, cov_lm, cov_conf, cov_Kconf)
 )



 # conformal prediction for quantile regression
 # example 2.2
 library(quantreg)
 library(dplyr)

 # first simulate some data
 set.seed(45)
 sim_data <- function(N = 1000) {
  x <- runif(N, min = -4, max = 4)
  y <- sin(x*1.5) + rnorm(N, sd = 0.3)
  data.frame(x = x, y = y)
 } 
 df <- sim_data(1000)
 plot(
  x    = df,
  xlim = c(-4, 4),
  ylim = c(-2, 2),
  bty  = "n",
  col  = "lightgreen",
  main = "Conformal 90% quantile regression"
 )
 mtext("With wrong quantile model")

 # plot theoretical 5th and 95th quantiles
 x_new <- seq(-4, 4, 0.01)
 lines(x_new, sin(x_new * 1.5) + qnorm(0.95, sd = 0.3), lty = 1)
 lines(x_new, sin(x_new * 1.5) + qnorm(0.05, sd = 0.3), lty = 1)

 legend("topright", c("true (0.9)", "quant (0.6)", "conf (0.6)"), lty = c(1, 2, 3))

 # then estimate models for the 5th and 95th quantile
 # NB: uses incorrect model!
 model_lower <- rq(y ~ poly(x, 10), tau = 0.2, data = df)
 model_upper <- rq(y ~ poly(x, 10), tau = 0.8, data = df)

 # plot the estimated model
 lines(x_new, predict(model_lower, data.frame(x = x_new)), lty = 2)
 lines(x_new, predict(model_upper, data.frame(x = x_new)), lty = 2)


 # Conformalize the interval using calibration data
 df_c <- sim_data(300)
 points(df_c, col = "lightblue", pch = 3, cex = 0.7)

 s <- function(x, y) {
  t_hat_lower <- predict(model_lower, data.frame(x = x))
  t_hat_upper <- predict(model_upper, data.frame(x = x))
  return(max(t_hat_lower - y, y - t_hat_upper))
 }

 scores <- vapply(
  X = 1:nrow(df_c),
  FUN = \(i) s(df_c[i, 1], df_c[i, 2]),
  FUN.VALUE = 0.
 )

 qhat <- quantile(scores, 0.9)

 # plot conformalized intervals
 lines(x_new, predict(model_lower, data.frame(x = x_new)) - qhat, lty = 3)
 lines(x_new, predict(model_upper, data.frame(x = x_new)) + qhat, lty = 3)
	# Conformal prediction for regression prediction intervals
	# see example 2.3.1 from https://arxiv.org/pdf/2107.07511.pdf
	# goal: make a 90% prediction interval for linear model

	# First simulate some data with non-normal residuals
	set.seed(45)
	sim_data <- function(N) {
	x <- runif(N, min = -4, max = 4)
	y <- x + rgamma(N, 2, 1) - 2
	data.frame(x = x, y = y)
	}
	df <- sim_data(1000)

	# plot the data
	plot(
	x = df,
	xlim = c(-4, 4),
	ylim = c(-7, 9),
	bty = "L",
	col = "lightseagreen",
	main = "Conformal prediction intervals"
	)
	mtext("For linear regression with non-normal residuals")

	# theoretical prediction intervals of the true model
	x_new <- seq(-4, 4, 0.01)
	lines(x_new, x_new - 2 + qgamma(0.95, 2, 1))
	lines(x_new, x_new - 2 + qgamma(0.05, 2, 1))

	# test coverage should be 90%
	coverage <- function(y, lower, upper) {
	sum(y >= lower & y <= upper) / length(y)
	}
	df_test <- sim_data(1e7)
	cov_true <- coverage(
	y = df_test$y,
	lower = df_test$x - 2 + qgamma(0.05, 2, 1),
	upper = df_test$x - 2 + qgamma(0.95, 2, 1)
	)
	print(cov_true)

	# fit linear model
	fit <- lm(y ~ x, df)

	# get prediction interval
	pred <- predict(fit, newdata = data.frame(x = x_new), interval = "prediction", level = 0.9)

	# plot
	lines(x_new, pred[,2], lty = 2)
	lines(x_new, pred[,3], lty = 2)

	# test coverage
	test_pred <- predict(fit, newdata = df_test, interval = "prediction", level = 0.9)
	cov_lm <- coverage(
	y = df_test$y,
	lower = test_pred[,2],
	upper = test_pred[,3]
	)
	print(cov_lm)

	# Split conformal interval
	id_train <- sample(1:nrow(df), 800)
	df_train <- df[id_t,]
	df_calib <- df[-id_t,]
	fit_train <- lm(y ~ x, df_train)
	scores <- df_calib$y - predict(fit_train, df_calib)

	# plot the interval
	lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.95), lty = 3)
	lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(scores, 0.05), lty = 3)

	# compute coverage
	cov_conf <- coverage(
	y = df_test$y,
	upper = predict(fit, df_test) + quantile(scores, 0.95),
	lower = predict(fit, df_test) + quantile(scores, 0.05)
	)
	print(cov_conf)

	# K-fold cross-conformal interval
	K <- 1000
	fold_id <- sample(rep(1:K, each = ceiling(1000/K)))[1:1000]
	kscores <- numeric(1000)
	for (k in 1:K) {
	id_k <- fold_id == k
	fit_k <- lm(y ~ x, df[!id_k,])
	kscores[id_k] <- df[id_k,]$y - predict(fit_k, df[id_k, ])
	}

	cov_Kconf <- coverage(
	y = df_test$y,
	upper = predict(fit, df_test) + quantile(kscores, 0.95),
	lower = predict(fit, df_test) + quantile(kscores, 0.05)
	)
	print(cov_Kconf)


	lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.95), lty = 4)
	lines(x_new, predict(fit, data.frame(x = x_new)) + quantile(kscores, 0.05), lty = 4)


	legend("bottomright", c("true", "lm", "conf", "Kconf"), lty = 1:4)

	# summary table for coverage
	data.frame(
	method = c("true", "lm", "conf", "Kconf"),
	coverage = c(cov_true, cov_lm, cov_conf, cov_Kconf)
	)



	# conformal prediction for quantile regression
	# example 2.2
	library(quantreg)
	library(dplyr)

	# first simulate some data
	set.seed(45)
	sim_data <- function(N = 1000) {
	x <- runif(N, min = -4, max = 4)
	y <- sin(x*1.5) + rnorm(N, sd = 0.3)
	data.frame(x = x, y = y)
	}
	df <- sim_data(1000)
	plot(
	x = df,
	xlim = c(-4, 4),
	ylim = c(-2, 2),
	bty = "n",
	col = "lightgreen",
	main = "Conformal 90% quantile regression"
	)
	mtext("With wrong quantile model")

	# plot theoretical 5th and 95th quantiles
	x_new <- seq(-4, 4, 0.01)
	lines(x_new, sin(x_new * 1.5) + qnorm(0.95, sd = 0.3), lty = 1)
	lines(x_new, sin(x_new * 1.5) + qnorm(0.05, sd = 0.3), lty = 1)

	legend("topright", c("true (0.9)", "quant (0.6)", "conf (0.6)"), lty = c(1, 2, 3))

	# then estimate models for the 5th and 95th quantile
	# NB: uses incorrect model!
	model_lower <- rq(y ~ poly(x, 10), tau = 0.2, data = df)
	model_upper <- rq(y ~ poly(x, 10), tau = 0.8, data = df)

	# plot the estimated model
	lines(x_new, predict(model_lower, data.frame(x = x_new)), lty = 2)
	lines(x_new, predict(model_upper, data.frame(x = x_new)), lty = 2)


	# Conformalize the interval using calibration data
	df_c <- sim_data(300)
	points(df_c, col = "lightblue", pch = 3, cex = 0.7)

	s <- function(x, y) {
	t_hat_lower <- predict(model_lower, data.frame(x = x))
	t_hat_upper <- predict(model_upper, data.frame(x = x))
	return(max(t_hat_lower - y, y - t_hat_upper))
	}

	scores <- vapply(
	X = 1:nrow(df_c),
	FUN = \(i) s(df_c[i, 1], df_c[i, 2]),
	FUN.VALUE = 0.
	)

	qhat <- quantile(scores, 0.9)

	# plot conformalized intervals
	lines(x_new, predict(model_lower, data.frame(x = x_new)) - qhat, lty = 3)
	lines(x_new, predict(model_upper, data.frame(x = x_new)) + qhat, lty = 3)