set.seed(123)
# Number of predictors
P <- 10
# Sample size
N <- 10000
# Generate variance covariance matrix as toeplitz
V <- toeplitz(P:1/P)
# Relative size of coefficients
b <- 1:P
# Set coefficient of determination
R2 <- 0.5
# Calculate regression coefficients
b <- b * c(sqrt(R2 / b %*% V %*% b))
# Generate data
X <- matrix(rnorm(N * P), nrow = N, ncol = P) %*% chol(V)
y <- X %*% b + rnorm(N, 0, sqrt(1-R2))
# Collect data in data frame
df <- data.frame(X = X, y = y)
# Induce missings
amp <- mice::ampute(
df,
prop = 0.5,
patterns = 1 - lower.tri(matrix(1, P, P+1), TRUE),
mech = "MCAR"
)$amp
# Perform regression imputation
predimp <- mice::mice(
amp,
m = 1,
method = "norm.predict",
predictorMatrix = cbind(1 - diag(nrow = P+1, ncol = P), 0),
print = FALSE
)
# Perform multiple imputation
miimp <- mice::mice(
amp, m = 10, maxit = 10, method = "norm", print = FALSE
)
# Fit quantile regression model with regression imputation data
predfit <- with(
predimp, quantreg::rq(y~X.1 + X.2 + X.3 + X.4 + X.5, tau = c(0.1, 0.5, 0.9))
)
# Calculate prediction interval for y
predint <- predfit$analyses[[1]] |> predict()
# Fit quantile regression model with multiple imputation data
mifit <- with(
miimp, quantreg::rq(y~X.1 + X.2 + X.3 + X.4 + X.5, tau = c(0.1, 0.5, 0.9))
)
# Calculate quantile regression prediction interval
miint <- lapply(mifit$analyses, predict) |>
unlist() |>
array(dim = c(N, 3, 10)) |>
apply(c(1,2), mean)
# Plot predicted versus observed y values with corresponding prediction intervals
# under regression imputation
predorder <- order(predint[,2])
plot(predint[predorder,2], y[predorder])
points(predint[predorder, 2][rowSums(is.na(amp)) >= 4],
y[predorder][rowSums(is.na(amp)) >= 4], col = "red")
segments(x0 = predint[predorder, 2],
y0 = predint[predorder, 1],
y1 = predint[predorder, 3],
col = "lightblue")# Evaluate marginal and conditional coverage of prediction intervals with
# corresponding interval width under regression imputation
cbind(amp, predint) |>
dplyr::mutate(sumna = rowSums(is.na(amp))) |>
dplyr::group_by(sumna) |>
dplyr::summarise(pred = mean(`2`),
cov = mean(`1` < y & y < `3`),
width = mean(`3` - `1`)) |>
dplyr::bind_rows(data.frame(sumna = 99,
pred = mean(predint[,2]),
cov = mean(predint[,1] < y & y < predint[,3]),
width = mean(predint[,3] - predint[,1])))
#> # A tibble: 12 × 4
#> sumna pred cov width
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0.00322 0.814 2.14
#> 2 1 0.0613 0.841 2.14
#> 3 2 0.0243 0.828 2.14
#> 4 3 -0.00620 0.815 2.14
#> 5 4 0.0278 0.821 2.14
#> 6 5 0.0125 0.831 2.14
#> 7 6 0.0319 0.784 2.14
#> 8 7 0.0303 0.760 2.14
#> 9 8 -0.00505 0.751 2.14
#> 10 9 0.0228 0.711 2.14
#> 11 10 0.0103 0.716 2.14
#> 12 99 0.0122 0.799 2.14
# Plot predicted versus observed y values with corresponding prediction intervals
# under multiple imputation
predorder <- order(miint[,2])
plot(miint[predorder,2], y[predorder])
points(miint[predorder, 2][rowSums(is.na(amp)) >= 4],
y[predorder][rowSums(is.na(amp)) >= 4], col = "red")
segments(x0 = miint[predorder, 2],
y0 = miint[predorder, 1],
y1 = miint[predorder, 3],
col = "lightblue")# Evaluate marginal and conditional coverage of prediction intervals with
# corresponding interval width under multiple imputation
cbind(amp, miint) |>
dplyr::mutate(sumna = rowSums(is.na(amp))) |>
dplyr::group_by(sumna) |>
dplyr::summarise(pred = mean(`2`),
cov = mean(`1` < y & y < `3`),
width = mean(`3` - `1`)) |>
dplyr::bind_rows(data.frame(sumna = 99,
pred = mean(miint[,2]),
cov = mean(miint[,1] < y & y < miint[,3]),
width = mean(miint[,3] - miint[,1])))
#> # A tibble: 12 × 4
#> sumna pred cov width
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0.00556 0.811 2.11
#> 2 1 0.0635 0.839 2.11
#> 3 2 0.0239 0.824 2.11
#> 4 3 -0.00259 0.801 2.11
#> 5 4 0.0281 0.802 2.11
#> 6 5 0.0181 0.833 2.11
#> 7 6 0.0397 0.815 2.11
#> 8 7 0.0436 0.821 2.11
#> 9 8 0.0104 0.852 2.11
#> 10 9 0.0284 0.811 2.11
#> 11 10 0.0257 0.798 2.11
#> 12 99 0.0169 0.815 2.11Created on 2025-06-03 with reprex v2.1.1


So, multiple imputation yields slightly smaller prediction interval widths than those obtained under regression imputation, but also overcovers marginally (note that the sumna = 99 corresponds to the marginal coverage). On the upside, the conditional coverage is much better than the conditional coverage under regression imputation.