04-03-2025
An attempt at a faster estimice() function for the R-package mice.
devtools::load_all(path = "C:/Users/5868777/surfdrive/Documents/mice")
#> ℹ Loading mice
# Small data example
N <- 100
P <- 5
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)
# Check for same result
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr")
#> $coef
#> [,1]
#> x1 1.2435131
#> x2 1.6511610
#> x3 0.8942764
#> x4 1.3913726
#> x5 -0.1732622
#>
#> $beta
#> [,1]
#> x1 2.5601151
#> x2 2.7015099
#> x3 0.2100016
#> x4 -0.8644491
#> x5 0.2907068
#>
#> $sigma
#> [1] 1.079444
#>
#> $estimation
#> [1] "qr"
set.seed(1)
.norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")
#> $coef
#> x1 x2 x3 x4 x5
#> 1.2435131 1.6511610 0.8942764 1.3913726 -0.1732622
#>
#> $beta
#> x1 x2 x3 x4 x5
#> 2.5601151 2.7015099 0.2100016 -0.8644491 0.2907068
#>
#> $sigma
#> [1] 1.079444
#>
#> $estimation
#> [1] "qr"
rbenchmark::benchmark(
old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
replications = 10000
)
#> test replications elapsed relative user.self sys.self user.child sys.child
#> 2 new 10000 0.46 1.000 0.42 0.03 NA NA
#> 1 old 10000 0.79 1.717 0.69 0.08 NA NA
# Large data example
N <- 2000
P <- 500
X <- matrix(rnorm(N*P), N) %*% chol(0.99 + 0.01 * diag(P))
y <- X %*% rep(1, P) + rnorm(N)
y[5:6] <- NA
ry <- !is.na(y)
rbenchmark::benchmark(
old = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr")},
new = {set.seed(1); .norm.draw(y, ry, X, ls.meth = "qr", estfunc = "new")},
replications = 10
)
#> test replications elapsed relative user.self sys.self user.child sys.child
#> 2 new 10 1.74 1.000 1.69 0.00 NA NA
#> 1 old 10 2.49 1.431 2.38 0.04 NA NACreated on 2025-03-04 with reprex v2.0.2
These results were obtained with the following functions.
.norm.draw.new <- function(y, ry, x, rank.adjust = TRUE, ...) {
parm <- estimice2(x[ry, , drop = FALSE], y[ry], ...)
if (any(is.na(parm$coef)) & rank.adjust) {
parm$coef[is.na(parm$coef)] <- 0
parm$beta[is.na(parm$beta)] <- 0
}
parm
}
estimice2 <- function(x, y, ls.meth = "qr", ridge = 1e-5, ...) {
p <- ncol(x)
df <- max(length(y) - p, 1)
if (ls.meth == "qr") {
qr <- lm.fit(x = x, y = y)
R <- qr.R(qr$qr)
if (qr$rank < p) {
pen <- ridge * diag(crossprod(R))
pen[1,1] <- 0 # not penalize intercept
qr <- lm.fit(x = rbind(x, pen), y = c(y, rep(0, p)))
qr <- .trim.qr.ridge(qr, p)
mess <- paste0(
"mice detected that your data are (nearly) multi-collinear.\n",
"It applied a ridge penalty to continue calculations, but the results can be unstable.\n",
"Does your dataset contain duplicates, linear transformation, or factors with unique respondent names?"
)
updateLog(out = mess, frame = 6)
if (get.printFlag()) {
cat("*")
}
}
sigma.star <- sqrt(c(crossprod(qr$residuals)) / rchisq(1, df))
}
list(
coef = qr$coefficients,
beta = qr$coefficients + c(crossprod(chol(chol2inv(R)), rnorm(p)) * sigma.star),
sigma = sigma.star,
estimation = ls.meth
)
}
.trim.qr.ridge <- function(qr, p) {
n <- seq_len(length(qr$residuals) - p)
qr$residuals <- qr$residuals[n]
qr$fitted.values <- qr$fitted.values[n]
qr$qr$qr <- qr$qr$qr[n, , drop = FALSE]
qr
}
Uh oh!
There was an error while loading. Please reload this page.