04-03-2025
An attempt at a faster estimice() function for the R-package mice.
devtools::load_all(path = "C:/Users/5868777/surfdrive/Documents/mice")
#> ℹ Loading mice
# Small data example
N <- 100
P <- 5
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)
# Check for same result
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr")
#> $coef
#>          [,1]
#> x1  1.2435131
#> x2  1.6511610
#> x3  0.8942764
#> x4  1.3913726
#> x5 -0.1732622
#> 
#> $beta
#>          [,1]
#> x1  2.5601151
#> x2  2.7015099
#> x3  0.2100016
#> x4 -0.8644491
#> x5  0.2907068
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")
#> $coef
#>         x1         x2         x3         x4         x5 
#>  1.2435131  1.6511610  0.8942764  1.3913726 -0.1732622 
#> 
#> $beta
#>         x1         x2         x3         x4         x5 
#>  2.5601151  2.7015099  0.2100016 -0.8644491  0.2907068 
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"
rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10000
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new        10000    0.46    1.000      0.42     0.03         NA        NA
#> 1  old        10000    0.79    1.717      0.69     0.08         NA        NA
# Large data example
N <- 2000
P <- 500
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)
rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new           10    1.74    1.000      1.69     0.00         NA        NA
#> 1  old           10    2.49    1.431      2.38     0.04         NA        NACreated on 2025-03-04 with reprex v2.0.2
These results were obtained with the following functions.
.norm.draw.new <- function(y, ry, x, rank.adjust = TRUE, ...) {
  parm <- estimice2(x[ry, , drop = FALSE], y[ry], ...)
  if (any(is.na(parm$coef)) & rank.adjust) {
    parm$coef[is.na(parm$coef)] <- 0
    parm$beta[is.na(parm$beta)] <- 0
  }
  parm
}
estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-5, ...) {
  p <- ncol(x)
  df <- max(length(y) - p, 1)
  if (ls.meth == "qr") {
    qr <- lm.fit(x = x, y = y)
    R <- qr.R(qr$qr)
    if (qr$rank < p) {
      pen <- ridge * diag(crossprod(R))
      pen[1,1] <- 0 # not penalize intercept
      qr <- lm.fit(x = rbind(x, pen), y = c(y, rep(0, p)))
      qr <- .trim.qr.ridge(qr, p)
      mess <- paste0(
        "mice detected that your data are (nearly) multi-collinear.\n",
        "It applied a ridge penalty to continue calculations, but the results can be unstable.\n",
        "Does your dataset contain duplicates, linear transformation, or factors with unique respondent names?"
      )
      updateLog(out = mess, frame = 6)
      if (get.printFlag()) {
        cat("*")
      }
    }
    sigma.star <- sqrt(c(crossprod(qr$residuals)) / rchisq(1, df))
  }
  list(
    coef = qr$coefficients, 
    beta = qr$coefficients + c(crossprod(chol(chol2inv(R)), rnorm(p)) * sigma.star),
    sigma = sigma.star,
    estimation = ls.meth
  )
}
.trim.qr.ridge <- function(qr, p) {
  n <- seq_len(length(qr$residuals) - p)
  qr$residuals <- qr$residuals[n]
  qr$fitted.values <- qr$fitted.values[n]
  qr$qr$qr <- qr$qr$qr[n, , drop = FALSE]
  qr
}
Created on 2025-10-04 with reprex v2.1.1