Skip to content

Instantly share code, notes, and snippets.

@thomvolker
Created March 4, 2025 16:15
Show Gist options
  • Save thomvolker/349114990b871ac1b3ee8b0b3012273c to your computer and use it in GitHub Desktop.
Save thomvolker/349114990b871ac1b3ee8b0b3012273c to your computer and use it in GitHub Desktop.

A slightly modified estimice() and .norm.draw()

Thom Benjamin Volker

04-03-2025

An attempt at a faster estimice() function for the R-package mice.

devtools::load_all(path = "C:/Users/5868777/surfdrive/Documents/mice")
#> ℹ Loading mice

# Small data example

N <- 100
P <- 5
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

# Check for same result
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr")
#> $coef
#>          [,1]
#> x1  1.2435131
#> x2  1.6511610
#> x3  0.8942764
#> x4  1.3913726
#> x5 -0.1732622
#> 
#> $beta
#>          [,1]
#> x1  2.5601151
#> x2  2.7015099
#> x3  0.2100016
#> x4 -0.8644491
#> x5  0.2907068
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")
#> $coef
#>         x1         x2         x3         x4         x5 
#>  1.2435131  1.6511610  0.8942764  1.3913726 -0.1732622 
#> 
#> $beta
#>         x1         x2         x3         x4         x5 
#>  2.5601151  2.7015099  0.2100016 -0.8644491  0.2907068 
#> 
#> $sigma
#> [1] 1.079444
#> 
#> $estimation
#> [1] "qr"

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10000
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new        10000    0.46    1.000      0.42     0.03         NA        NA
#> 1  old        10000    0.79    1.717      0.69     0.08         NA        NA

# Large data example

N <- 2000
P <- 500
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)

rbenchmark::benchmark(
  old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
  new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
  replications = 10
)
#>   test replications elapsed relative user.self sys.self user.child sys.child
#> 2  new           10    1.74    1.000      1.69     0.00         NA        NA
#> 1  old           10    2.49    1.431      2.38     0.04         NA        NA

Created on 2025-03-04 with reprex v2.0.2

These results were obtained with the following functions.

.norm.draw.new <- function(y, ry, x, rank.adjust = TRUE, ...) {
  parm <- estimice2(x[ry, , drop = FALSE], y[ry], ...)
  if (any(is.na(parm$coef)) & rank.adjust) {
    parm$coef[is.na(parm$coef)] <- 0
    parm$beta[is.na(parm$beta)] <- 0
  }
  parm
}

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-5, ...) {
  p <- ncol(x)
  df <- max(length(y) - p, 1)
  if (ls.meth == "qr") {
    qr <- lm.fit(x = x, y = y)
    R <- qr.R(qr$qr)
    if (qr$rank < p) {
      pen <- ridge * diag(crossprod(R))
      pen[1,1] <- 0 # not penalize intercept
      qr <- lm.fit(x = rbind(x, pen), y = c(y, rep(0, p)))
      qr <- .trim.qr.ridge(qr, p)
      mess <- paste0(
        "mice detected that your data are (nearly) multi-collinear.\n",
        "It applied a ridge penalty to continue calculations, but the results can be unstable.\n",
        "Does your dataset contain duplicates, linear transformation, or factors with unique respondent names?"
      )
      updateLog(out = mess, frame = 6)
      if (get.printFlag()) {
        cat("*")
      }
    }
    sigma.star <- sqrt(c(crossprod(qr$residuals)) / rchisq(1, df))
  }
  list(
    coef = qr$coefficients, 
    beta = qr$coefficients + c(crossprod(chol(chol2inv(R)), rnorm(p)) * sigma.star),
    sigma = sigma.star,
    estimation = ls.meth
  )
}

.trim.qr.ridge <- function(qr, p) {
  n <- seq_len(length(qr$residuals) - p)
  qr$residuals <- qr$residuals[n]
  qr$fitted.values <- qr$fitted.values[n]
  qr$qr$qr <- qr$qr$qr[n, , drop = FALSE]
  qr
}
@thomvolker
Copy link
Author

setwd("c:/Users/5868777/surfdrive/Documents/mice")
devtools::load_all()
#> ℹ Loading mice

P <- 50
N <- 1000

m <- 1:P
V <- diag(sqrt(1:P)) %*% (0.5 + 0.5 * diag(P)) %*% diag(sqrt(1:P))

X <- rnorm(N*P) |> matrix(N)
X <- X - t(colMeans(X)) %x% rep(1, N)
X <- X %*% solve(chol(var(X)))

X <- X %*% chol(V)
X <- X + t(m) %x% rep(1, N)


fit <- lm(X[,50]~X[,1:49])
coefs <- coef(fit)
vars <- vcov(fit)

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[,1:49]), X[,50], ls.meth = "qr")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,], breaks = 50)
  abline(v = coefs[i])
  lower <- coefs[i] - qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  upper <- coefs[i] + qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  abline(v = lower, col = "red")
  abline(v = upper, col = "red")
  cat("Coverage (var %i): ", mean(reps[i,] > lower & reps[i,] < upper), "\n")
}
#> Coverage (var %i):  0.948
#> Coverage (var %i):  0.944

#> Coverage (var %i):  0.952

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[,1:49]), X[,50], ls.meth = "ridge")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,])
  abline(v = coefs[i])
  lower <- coefs[i] - qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  upper <- coefs[i] + qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  abline(v = lower, col = "red")
  abline(v = upper, col = "red")
  cat("Coverage (var %i): ", mean(reps[i,] > lower & reps[i,] < upper), "\n")
}
#> Coverage (var %i):  0.95
#> Coverage (var %i):  0.949

#> Coverage (var %i):  0.948

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[,1:49]), X[,50], ls.meth = "svd")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,], breaks = 100)
  abline(v = coefs[i])
  lower <- coefs[i] - qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  upper <- coefs[i] + qt(0.975, pmax(N-P, 1)) * sqrt(vars[i,i])
  abline(v = lower, col = "red")
  abline(v = upper, col = "red")
  cat("Coverage (var %i): ", mean(reps[i,] > lower & reps[i,] < upper), "\n")
}
#> Coverage (var %i):  0.944
#> Coverage (var %i):  0.943

#> Coverage (var %i):  0.938

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[1:20,1:49]), X[1:20,50], ls.meth = "qr")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,], breaks = 50)
  abline(v = coefs[i])
}

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[1:20,1:49]), X[1:20,50], ls.meth = "ridge")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,])
  abline(v = coefs[i])
}

reps <- sapply(1:1000, \(x) {
  estimice2(cbind(1, X[1:20,1:49]), X[1:20,50], ls.meth = "svd")$beta
})

par(mfrow = c(3, 1))
for (i in 1:3) {
  hist(reps[i,], breaks = 100)
  abline(v = coefs[i])
}

# Compare estimice2 and estimice

bench::mark(
  fold = {set.seed(1); .norm.draw(X[,50], rep(TRUE, N), X[,1:49])},
  fnew = {set.seed(1); .norm.draw.new(X[,50], rep(TRUE, N), X[,1:49])},
  check = FALSE
)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 fold         1.06ms   1.32ms      720.    1.44MB     22.7
#> 2 fnew        935.1µs   1.21ms      840.     1.3MB     25.3

all.equal(
  target = {set.seed(1); .norm.draw(X[,50], rep(TRUE, N), X[,1:49])},
  current = {set.seed(1); .norm.draw.new(X[,50], rep(TRUE, N), X[,1:49])},
  check.attributes = FALSE
)
#> [1] TRUE

bench::mark(
  fold = {set.seed(1); .norm.draw(X[1:50,10], rep(TRUE, 50), X[1:50,1:9])},
  fnew = {set.seed(1); .norm.draw.new(X[1:50,10], rep(TRUE, 50), X[1:50,1:9])},
  check = FALSE
)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 fold         58.5µs   77.5µs    12475.    24.3KB     6.37
#> 2 fnew           28µs   35.9µs    27105.    18.7KB    10.8

all.equal(
  target = {set.seed(1); .norm.draw(X[1:50,10], rep(TRUE, 50), X[1:50,1:9])},
  current = {set.seed(1); .norm.draw.new(X[1:50,10], rep(TRUE, 50), X[1:50,1:9])},
  check.attributes = FALSE
)
#> [1] TRUE

bench::mark(
  fold = {set.seed(1); .norm.draw(X[1:40,50], rep(TRUE, 40), X[1:40,1:49])},
  fnew = {set.seed(1); .norm.draw.new(X[1:40,50], rep(TRUE, 40), X[1:40,1:49])},
  check = FALSE  
)
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 fold          408µs    540µs     1697.     392KB     13.1
#> 2 fnew          464µs    557µs     1788.     352KB     13.1

Created on 2025-10-04 with reprex v2.1.1

@thomvolker
Copy link
Author

thomvolker commented Oct 6, 2025

estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-05, ...) {
  dim <- dim(x)

  if (ls.meth == "qr") {
    fit <- .lm.fit(x = x, y = y)
    df <- dim[1] - fit$rank
    rankdef <- is_rank_deficient(fit$rank, dim[1], dim[2])
    resids <- fit$residuals

    if (rankdef) {
      if (fit$rank < dim[1]) {
        # collinear system but p < n; kick out collinear predictors
        piv <- match(seq_len(dim[2]), fit$pivot)
        coefs <- fit$coefficients[piv] # re-order coefficients such that correct ones are set to zero
        S <- matrix(0, dim[2], dim[2])
        S[seq_len(fit$rank), seq_len(fit$rank)] <- chol(QR_to_inv(
          fit$qr,
          fit$rank
        ))
        Sigma <- S[piv, piv, drop = FALSE]
      } else {
        ls.meth <- "ridge"
      }
      message <- paste0(
        "Your data are super-collinear. See `?is_rank_deficient` for more information."
      )
      #updateLog(out = mess, frame = 6)
      #if (get.printFlag()) {
      #  cat("*")
      #} # indicator of added ridge penalty in the printed iteration history
    } else {
      coefs <- fit$coefficients
      Sigma <- chol(QR_to_inv(fit$qr, fit$rank))
    }
  }
  if (ls.meth == "ridge") {
    pen <- ridge * apply(x, 2, var) * sqrt(dim[1] * (dim[1]-1) / var(y)) # same scaling as glmnet
    fit <- .lm.fit(
      rbind(x, diag(sqrt(pen))),
      c(y, rep(0, dim[2]))
    )
    coefs <- fit$coefficients
    inv <- QR_to_inv(fit$qr, fit$rank)
    Sigma <- chol(inv)
    df <- pmax(1, dim[1] - sum(diag(x %*% inv %*% t(x)))) # penalty matrix is diagonal, so t(pen) == pen
    resids <- fit$residuals
  }
  if (ls.meth == "svd") {
    fit <- NULL
    eps <- sqrt(.Machine$double.eps)
    s <- svd(x)
    rank <- sum(s$d > eps)
    if (rank == dim[1]) {
      dinv <- ifelse(s$d > eps, s$d / (s$d^2 + ridge), 0)
    } else {
      dinv <- ifelse(s$d > eps, 1/s$d, 0)
    }
    coefs <- s$v %*% (crossprod(s$u, y) * dinv)
    resids <- y - x %*% coefs
    Sigma <- t(s$v[,seq_len(rank),drop = FALSE] %*% diag(dinv[seq_len(rank)], nrow = rank))
    df <- pmax(dim[1] - rank, 1)
    dim[2] <- nrow(Sigma) # draw coefficients in subspace
  }
  sigma.star <- sqrt(c(crossprod(resids)) / rchisq(1, df))
  Sigma <- sigma.star * Sigma
  out <- list(
    coef = as.matrix(coefs),
    beta = as.matrix(coefs + c(crossprod(Sigma, rnorm(dim[2])))),
    sigma = sigma.star,
    estimation = ls.meth
  )
  out
}


get.printFlag <- function(start = 4) {
  while (inherits(
    try(get("printFlag", parent.frame(start)), silent = TRUE),
    "try-error"
  )) {
    start <- start + 1
  }
  get("printFlag", parent.frame(start))
}

is_rank_deficient <- function(rank, n, p) {
  rank < p | rank == n
}


QR_to_inv <- function(qr, rank) {
  R <- qr[1:rank, 1:rank, drop = FALSE]
  R[lower.tri(R)] <- 0
  chol2inv(R)
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment