Skip to content

Instantly share code, notes, and snippets.

@thomvolker
Created June 3, 2025 12:56
Show Gist options
  • Save thomvolker/c4e3e439c76923f40ef8487a8c29e1be to your computer and use it in GitHub Desktop.
Save thomvolker/c4e3e439c76923f40ef8487a8c29e1be to your computer and use it in GitHub Desktop.
set.seed(123)

# Number of predictors
P <- 10
# Sample size
N <- 10000
# Generate variance covariance matrix as toeplitz
V <- toeplitz(P:1/P)
# Relative size of coefficients
b <- 1:P
# Set coefficient of determination
R2 <- 0.5
# Calculate regression coefficients
b <- b * c(sqrt(R2 / b %*% V %*% b))

# Generate data
X <- matrix(rnorm(N * P), nrow = N, ncol = P) %*% chol(V)
y <- X %*% b + rnorm(N, 0, sqrt(1-R2))

# Collect data in data frame
df <- data.frame(X = X, y = y)

# Induce missings
amp <- mice::ampute(
  df, 
  prop = 0.5, 
  patterns = 1 - lower.tri(matrix(1, P, P+1), TRUE),
  mech = "MCAR"
)$amp

# Perform regression imputation
predimp <- mice::mice(
  amp, 
  m = 1, 
  method = "norm.predict", 
  predictorMatrix = cbind(1 - diag(nrow = P+1, ncol = P), 0),
  print = FALSE
)

# Perform multiple imputation
miimp <- mice::mice(
  amp, m = 10, maxit = 10, method = "norm", print = FALSE
)

# Fit quantile regression model with regression imputation data
predfit <- with(
  predimp, quantreg::rq(y~X.1 + X.2 + X.3 + X.4 + X.5, tau = c(0.1, 0.5, 0.9))
)
# Calculate prediction interval for y
predint <- predfit$analyses[[1]] |> predict()

# Fit quantile regression model with multiple imputation data
mifit <- with(
  miimp, quantreg::rq(y~X.1 + X.2 + X.3 + X.4 + X.5, tau = c(0.1, 0.5, 0.9))
)
# Calculate quantile regression prediction interval
miint <- lapply(mifit$analyses, predict) |>
  unlist() |>
  array(dim = c(N, 3, 10)) |>
  apply(c(1,2), mean)

# Plot predicted versus observed y values with corresponding prediction intervals
# under regression imputation
predorder <- order(predint[,2])
plot(predint[predorder,2], y[predorder])
points(predint[predorder, 2][rowSums(is.na(amp)) >= 4],
       y[predorder][rowSums(is.na(amp)) >= 4], col = "red")
segments(x0 = predint[predorder, 2],
         y0 = predint[predorder, 1],
         y1 = predint[predorder, 3],
         col = "lightblue")

# Evaluate marginal and conditional coverage of prediction intervals with
# corresponding interval width under regression imputation
cbind(amp, predint) |>
  dplyr::mutate(sumna = rowSums(is.na(amp))) |>
  dplyr::group_by(sumna) |>
  dplyr::summarise(pred = mean(`2`),
                   cov = mean(`1` < y & y < `3`),
                   width = mean(`3` - `1`)) |>
  dplyr::bind_rows(data.frame(sumna = 99, 
                              pred = mean(predint[,2]),
                              cov = mean(predint[,1] < y & y < predint[,3]),
                              width = mean(predint[,3] - predint[,1])))
#> # A tibble: 12 × 4
#>    sumna     pred   cov width
#>    <dbl>    <dbl> <dbl> <dbl>
#>  1     0  0.00322 0.814  2.14
#>  2     1  0.0613  0.841  2.14
#>  3     2  0.0243  0.828  2.14
#>  4     3 -0.00620 0.815  2.14
#>  5     4  0.0278  0.821  2.14
#>  6     5  0.0125  0.831  2.14
#>  7     6  0.0319  0.784  2.14
#>  8     7  0.0303  0.760  2.14
#>  9     8 -0.00505 0.751  2.14
#> 10     9  0.0228  0.711  2.14
#> 11    10  0.0103  0.716  2.14
#> 12    99  0.0122  0.799  2.14

# Plot predicted versus observed y values with corresponding prediction intervals
# under multiple imputation
predorder <- order(miint[,2])
plot(miint[predorder,2], y[predorder])
points(miint[predorder, 2][rowSums(is.na(amp)) >= 4],
       y[predorder][rowSums(is.na(amp)) >= 4], col = "red")
segments(x0 = miint[predorder, 2],
         y0 = miint[predorder, 1],
         y1 = miint[predorder, 3],
         col = "lightblue")

# Evaluate marginal and conditional coverage of prediction intervals with
# corresponding interval width under multiple imputation
cbind(amp, miint) |>
  dplyr::mutate(sumna = rowSums(is.na(amp))) |>
  dplyr::group_by(sumna) |>
  dplyr::summarise(pred = mean(`2`),
                   cov = mean(`1` < y & y < `3`),
                   width = mean(`3` - `1`)) |>
  dplyr::bind_rows(data.frame(sumna = 99, 
                              pred = mean(miint[,2]),
                              cov = mean(miint[,1] < y & y < miint[,3]),
                              width = mean(miint[,3] - miint[,1])))
#> # A tibble: 12 × 4
#>    sumna     pred   cov width
#>    <dbl>    <dbl> <dbl> <dbl>
#>  1     0  0.00556 0.811  2.11
#>  2     1  0.0635  0.839  2.11
#>  3     2  0.0239  0.824  2.11
#>  4     3 -0.00259 0.801  2.11
#>  5     4  0.0281  0.802  2.11
#>  6     5  0.0181  0.833  2.11
#>  7     6  0.0397  0.815  2.11
#>  8     7  0.0436  0.821  2.11
#>  9     8  0.0104  0.852  2.11
#> 10     9  0.0284  0.811  2.11
#> 11    10  0.0257  0.798  2.11
#> 12    99  0.0169  0.815  2.11

Created on 2025-06-03 with reprex v2.1.1

@thomvolker
Copy link
Author

So, multiple imputation yields slightly smaller prediction interval widths than those obtained under regression imputation, but also overcovers marginally (note that the sumna = 99 corresponds to the marginal coverage). On the upside, the conditional coverage is much better than the conditional coverage under regression imputation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment