A slightly modified `estimice()` and `.norm.draw()`

Thom Benjamin Volker

04-03-2025

An attempt at a faster estimice() function for the R-package mice.

devtools::load_all(path = "C:/Users/5868777/surfdrive/Documents/mice")
#> ℹ Loading mice

# Small data example

N <- 100
P <- 5
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

# Check for same result
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr")
#> $coef
#>          [,1]
#> x1  1.2435131
#> x2  1.6511610
#> x3  0.8942764
#> x4  1.3913726
#> x5 -0.1732622
#> 
#> $beta
#>          [,1]
#> x1  2.5601151
#> x2  2.7015099
#> x3  0.2100016
#> x4 -0.8644491
#> x5  0.2907068
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")
#> $coef
#>         x1         x2         x3         x4         x5 
#>  1.2435131  1.6511610  0.8942764  1.3913726 -0.1732622 
#> 
#> $beta
#>         x1         x2         x3         x4         x5 
#>  2.5601151  2.7015099  0.2100016 -0.8644491  0.2907068 
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10000
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new        10000    0.46    1.000      0.42     0.03         NA        NA
#> 1  old        10000    0.79    1.717      0.69     0.08         NA        NA

# Large data example

N <- 2000
P <- 500
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new           10    1.74    1.000      1.69     0.00         NA        NA
#> 1  old           10    2.49    1.431      2.38     0.04         NA        NA

^{Created on 2025-03-04 with reprex v2.0.2}

These results were obtained with the following functions.

.norm.draw.new <- function(y, ry, x, rank.adjust = TRUE, ...) {
  parm <- estimice2(x[ry, , drop = FALSE], y[ry], ...)
  if (any(is.na(parm$coef)) & rank.adjust) {
    parm$coef[is.na(parm$coef)] <- 0
    parm$beta[is.na(parm$beta)] <- 0
  }
  parm
}

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-5, ...) {
  p <- ncol(x)
  df <- max(length(y) - p, 1)
  if (ls.meth == "qr") {
    qr <- lm.fit(x = x, y = y)
    R <- qr.R(qr$qr)
    if (qr$rank < p) {
      pen <- ridge * diag(crossprod(R))
      pen[1,1] <- 0 # not penalize intercept
      qr <- lm.fit(x = rbind(x, pen), y = c(y, rep(0, p)))
      qr <- .trim.qr.ridge(qr, p)
      mess <- paste0(
        "mice detected that your data are (nearly) multi-collinear.\n",
        "It applied a ridge penalty to continue calculations, but the results can be unstable.\n",
        "Does your dataset contain duplicates, linear transformation, or factors with unique respondent names?"
      )
      updateLog(out = mess, frame = 6)
      if (get.printFlag()) {
        cat("*")
      }
    }
    sigma.star <- sqrt(c(crossprod(qr$residuals)) / rchisq(1, df))
  }
  list(
    coef = qr$coefficients, 
    beta = qr$coefficients + c(crossprod(chol(chol2inv(R)), rnorm(p)) * sigma.star),
    sigma = sigma.star,
    estimation = ls.meth
  )
}

.trim.qr.ridge <- function(qr, p) {
  n <- seq_len(length(qr$residuals) - p)
  qr$residuals <- qr$residuals[n]
  qr$fitted.values <- qr$fitted.values[n]
  qr$qr$qr <- qr$qr$qr[n, , drop = FALSE]
  qr
}

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-05, ...) { dim <- dim(x) if (ls.meth == "qr") { fit <- .lm.fit(x = x, y = y) df <- dim[1] - fit$rank rankdef <- is_rank_deficient(fit$rank, dim[1], dim[2]) resids <- fit$residuals if (rankdef) { if (fit$rank < dim[1]) { # collinear system but p < n; kick out collinear predictors piv <- match(seq_len(dim[2]), fit$pivot) coefs <- fit$coefficients[piv] # re-order coefficients such that correct ones are set to zero S <- matrix(0, dim[2], dim[2]) S[seq_len(fit$rank), seq_len(fit$rank)] <- chol(QR_to_inv( fit$qr, fit$rank )) Sigma <- S[piv, piv, drop = FALSE] } else { ls.meth <- "ridge" } message <- paste0( "Your data are super-collinear. See `?is_rank_deficient` for more information." ) #updateLog(out = mess, frame = 6) #if (get.printFlag()) { # cat("*") #} # indicator of added ridge penalty in the printed iteration history } else { coefs <- fit$coefficients Sigma <- chol(QR_to_inv(fit$qr, fit$rank)) } } if (ls.meth == "ridge") { pen <- ridge * apply(x, 2, var) * sqrt(dim[1] * (dim[1]-1) / var(y)) # same scaling as glmnet fit <- .lm.fit( rbind(x, diag(sqrt(pen))), c(y, rep(0, dim[2])) ) coefs <- fit$coefficients inv <- QR_to_inv(fit$qr, fit$rank) Sigma <- chol(inv) df <- pmax(1, dim[1] - sum(diag(x %*% inv %*% t(x)))) # penalty matrix is diagonal, so t(pen) == pen resids <- fit$residuals } if (ls.meth == "svd") { fit <- NULL eps <- sqrt(.Machine$double.eps) s <- svd(x) rank <- sum(s$d > eps) if (rank == dim[1]) { dinv <- ifelse(s$d > eps, s$d / (s$d^2 + ridge), 0) } else { dinv <- ifelse(s$d > eps, 1/s$d, 0) } coefs <- s$v %*% (crossprod(s$u, y) * dinv) resids <- y - x %*% coefs Sigma <- t(s$v[,seq_len(rank),drop = FALSE] %*% diag(dinv[seq_len(rank)], nrow = rank)) df <- pmax(dim[1] - rank, 1) dim[2] <- nrow(Sigma) # draw coefficients in subspace } sigma.star <- sqrt(c(crossprod(resids)) / rchisq(1, df)) Sigma <- sigma.star * Sigma out <- list( coef = as.matrix(coefs), beta = as.matrix(coefs + c(crossprod(Sigma, rnorm(dim[2])))), sigma = sigma.star, estimation = ls.meth ) out } get.printFlag <- function(start = 4) { while (inherits( try(get("printFlag", parent.frame(start)), silent = TRUE), "try-error" )) { start <- start + 1 } get("printFlag", parent.frame(start)) } is_rank_deficient <- function(rank, n, p) { rank < p | rank == n } QR_to_inv <- function(qr, rank) { R <- qr[1:rank, 1:rank, drop = FALSE] R[lower.tri(R)] <- 0 chol2inv(R) }

thomvolker/faster_estimice.md

A slightly modified `estimice()` and `.norm.draw()`

Thom Benjamin Volker

thomvolker commented Oct 6, 2025 •

edited

Loading

Uh oh!

thomvolker/faster_estimice.md

A slightly modified estimice() and .norm.draw()

Thom Benjamin Volker

thomvolker commented Oct 6, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

A slightly modified `estimice()` and `.norm.draw()`

thomvolker commented Oct 6, 2025 •

edited

Loading