Skip to content

Instantly share code, notes, and snippets.

@thomvolker
Created March 4, 2025 16:15
Show Gist options
  • Save thomvolker/349114990b871ac1b3ee8b0b3012273c to your computer and use it in GitHub Desktop.
Save thomvolker/349114990b871ac1b3ee8b0b3012273c to your computer and use it in GitHub Desktop.

A slightly modified estimice() and .norm.draw()

Thom Benjamin Volker

04-03-2025

An attempt at a faster estimice() function for the R-package mice.

devtools::load_all(path = "C:/Users/5868777/surfdrive/Documents/mice")
#> ℹ Loading mice

# Small data example

N <- 100
P <- 5
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

# Check for same result
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr")
#> $coef
#>          [,1]
#> x1  1.2435131
#> x2  1.6511610
#> x3  0.8942764
#> x4  1.3913726
#> x5 -0.1732622
#> 
#> $beta
#>          [,1]
#> x1  2.5601151
#> x2  2.7015099
#> x3  0.2100016
#> x4 -0.8644491
#> x5  0.2907068
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")
#> $coef
#>         x1         x2         x3         x4         x5 
#>  1.2435131  1.6511610  0.8942764  1.3913726 -0.1732622 
#> 
#> $beta
#>         x1         x2         x3         x4         x5 
#>  2.5601151  2.7015099  0.2100016 -0.8644491  0.2907068 
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10000
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new        10000    0.46    1.000      0.42     0.03         NA        NA
#> 1  old        10000    0.79    1.717      0.69     0.08         NA        NA

# Large data example

N <- 2000
P <- 500
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new           10    1.74    1.000      1.69     0.00         NA        NA
#> 1  old           10    2.49    1.431      2.38     0.04         NA        NA

Created on 2025-03-04 with reprex v2.0.2

These results were obtained with the following functions.

.norm.draw.new <- function(y, ry, x, rank.adjust = TRUE, ...) {
  parm <- estimice2(x[ry, , drop = FALSE], y[ry], ...)
  if (any(is.na(parm$coef)) & rank.adjust) {
    parm$coef[is.na(parm$coef)] <- 0
    parm$beta[is.na(parm$beta)] <- 0
  }
  parm
}

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-5, ...) {
  p <- ncol(x)
  df <- max(length(y) - p, 1)
  if (ls.meth == "qr") {
    qr <- lm.fit(x = x, y = y)
    R <- qr.R(qr$qr)
    if (qr$rank < p) {
      pen <- ridge * diag(crossprod(R))
      pen[1,1] <- 0 # not penalize intercept
      qr <- lm.fit(x = rbind(x, pen), y = c(y, rep(0, p)))
      qr <- .trim.qr.ridge(qr, p)
      mess <- paste0(
        "mice detected that your data are (nearly) multi-collinear.\n",
        "It applied a ridge penalty to continue calculations, but the results can be unstable.\n",
        "Does your dataset contain duplicates, linear transformation, or factors with unique respondent names?"
      )
      updateLog(out = mess, frame = 6)
      if (get.printFlag()) {
        cat("*")
      }
    }
    sigma.star <- sqrt(c(crossprod(qr$residuals)) / rchisq(1, df))
  }
  list(
    coef = qr$coefficients, 
    beta = qr$coefficients + c(crossprod(chol(chol2inv(R)), rnorm(p)) * sigma.star),
    sigma = sigma.star,
    estimation = ls.meth
  )
}

.trim.qr.ridge <- function(qr, p) {
  n <- seq_len(length(qr$residuals) - p)
  qr$residuals <- qr$residuals[n]
  qr$fitted.values <- qr$fitted.values[n]
  qr$qr$qr <- qr$qr$qr[n, , drop = FALSE]
  qr
}
@thomvolker
Copy link
Author

thomvolker commented Oct 6, 2025

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-05, ...) {
  dim <- dim(x)

  if (ls.meth == "qr") {
    fit <- .lm.fit(x = x, y = y)
    df <- dim[1] - fit$rank
    rankdef <- is_rank_deficient(fit$rank, dim[1], dim[2])
    resids <- fit$residuals

    if (rankdef) {
      if (fit$rank < dim[1]) {
        # collinear system but p < n; kick out collinear predictors
        piv <- match(seq_len(dim[2]), fit$pivot)
        coefs <- fit$coefficients[piv] # re-order coefficients such that correct ones are set to zero
        S <- matrix(0, dim[2], dim[2])
        S[seq_len(fit$rank), seq_len(fit$rank)] <- chol(QR_to_inv(
          fit$qr,
          fit$rank
        ))
        Sigma <- S[piv, piv, drop = FALSE]
      } else {
        ls.meth <- "ridge"
      }
      message <- paste0(
        "Your data are super-collinear. See `?is_rank_deficient` for more information."
      )
      #updateLog(out = mess, frame = 6)
      #if (get.printFlag()) {
      #  cat("*")
      #} # indicator of added ridge penalty in the printed iteration history
    } else {
      coefs <- fit$coefficients
      Sigma <- chol(QR_to_inv(fit$qr, fit$rank))
    }
  }
  if (ls.meth == "ridge") {
    pen <- ridge * apply(x, 2, var) * sqrt(dim[1] * (dim[1]-1) / var(y)) # same scaling as glmnet
    fit <- .lm.fit(
      rbind(x, diag(sqrt(pen))),
      c(y, rep(0, dim[2]))
    )
    coefs <- fit$coefficients
    inv <- QR_to_inv(fit$qr, fit$rank)
    Sigma <- chol(inv)
    df <- pmax(1, dim[1] - sum(diag(x %*% inv %*% t(x)))) # penalty matrix is diagonal, so t(pen) == pen
    resids <- fit$residuals
  }
  if (ls.meth == "svd") {
    fit <- NULL
    eps <- sqrt(.Machine$double.eps)
    s <- svd(x)
    rank <- sum(s$d > eps)
    if (rank == dim[1]) {
      dinv <- ifelse(s$d > eps, s$d / (s$d^2 + ridge), 0)
    } else {
      dinv <- ifelse(s$d > eps, 1/s$d, 0)
    }
    coefs <- s$v %*% (crossprod(s$u, y) * dinv)
    resids <- y - x %*% coefs
    Sigma <- t(s$v[,seq_len(rank),drop = FALSE] %*% diag(dinv[seq_len(rank)], nrow = rank))
    df <- pmax(dim[1] - rank, 1)
    dim[2] <- nrow(Sigma) # draw coefficients in subspace
  }
  sigma.star <- sqrt(c(crossprod(resids)) / rchisq(1, df))
  Sigma <- sigma.star * Sigma
  out <- list(
    coef = as.matrix(coefs),
    beta = as.matrix(coefs + c(crossprod(Sigma, rnorm(dim[2])))),
    sigma = sigma.star,
    estimation = ls.meth
  )
  out
}


get.printFlag <- function(start = 4) {
  while (inherits(
    try(get("printFlag", parent.frame(start)), silent = TRUE),
    "try-error"
  )) {
    start <- start + 1
  }
  get("printFlag", parent.frame(start))
}

is_rank_deficient <- function(rank, n, p) {
  rank < p | rank == n
}


QR_to_inv <- function(qr, rank) {
  R <- qr[1:rank, 1:rank, drop = FALSE]
  R[lower.tri(R)] <- 0
  chol2inv(R)
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment