Last active
January 21, 2023 20:18
-
-
Save noamross/ca6906e32f897e3729de40610beffcd1 to your computer and use it in GitHub Desktop.
Testing reduced-rank missing data strategies with mgcv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Testing missing data strategies with mgcv | |
library(mgcv) | |
library(MRFtools) # github.com/noamross/MRFtools | |
library(tidyverse) | |
# Simulate data with missing parts | |
n <- 350;set.seed(2) | |
dat <- gamSim(1,n=n,scale=3) ## 1 or 7 | |
drop <- sample(1:n,300) ## to | |
for (i in 2:5) dat[drop[1:75+(i-2)*75],i] <- NA | |
dname <- names(dat)[2:5] | |
dat1 <- dat | |
for (i in 1:4) { | |
by.name <- paste("m",dname[i],sep="") | |
dat1[[by.name]] <- is.na(dat1[[dname[i]]]) | |
dat1[[dname[i]]][dat1[[by.name]]] <- mean(dat1[[dname[i]]],na.rm=TRUE) | |
lev <- rep(1,n);lev[dat1[[by.name]]] <- 1:sum(dat1[[by.name]]) | |
id.name <- paste("id",dname[i],sep="") | |
dat1[[id.name]] <- factor(lev) | |
dat1[[by.name]] <- as.numeric(dat1[[by.name]]) | |
} | |
# Fit a model per the ?mgcv::missing.data.example, modeling missing data as random effect levels | |
mod_re <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+ | |
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+ | |
s(idx0,bs="re",by=mx0)+s(idx1,bs="re",by=mx1)+ | |
s(idx2,bs="re",by=mx2)+s(idx3,bs="re",by=mx3) | |
,data=dat1,discrete=TRUE) | |
# Fit a model using "mrf" instead of "re" | |
p0 <- MRFtools::mrf_penalty(dat1$idx0, type = "individual") # Same as diag(nlevels(dat$idx0)) | |
p1 <- MRFtools::mrf_penalty(dat1$idx1, type = "individual") | |
p2 <- MRFtools::mrf_penalty(dat1$idx2, type = "individual") | |
p3 <- MRFtools::mrf_penalty(dat1$idx3, type = "individual") | |
mod_mrf_full <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+ | |
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+ | |
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0) + | |
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1) + | |
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2) + | |
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3) | |
,data=dat1,discrete=TRUE) | |
# Now again but used reduced-rank MRFs terms | |
mod_mrf_reduced <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+ | |
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+ | |
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0, k = 5) + | |
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1, k = 5) + | |
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2, k = 5) + | |
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3, k = 5) | |
,data=dat1,discrete=TRUE) | |
## fit the model to the `complete case' data... | |
mod_complete <- bam(y~s(x0)+s(x1)+s(x2)+s(x3),data=dat,method="REML") | |
# Plot all the model smooths for comparison | |
par(mfrow=c(4,4),mar=c(4,4,1,1)) | |
for (i in 1:4) plot(mod_re,select=i) ## plot the smooth effects from random-effects model | |
for (i in 1:4) plot(mod_mrf_full,select=i) ## plot the smooth effects from mrf model | |
for (i in 1:4) plot(mod_mrf_reduced,select=i) ## plot the smooth effects from reduced-rank mrf model | |
for (i in 1:4) plot(mod_complete,select=i) ## plot the smooth effects from the complete data model | |
models <- list( | |
mod_re = mod_re, | |
mod_mrf_full = mod_mrf_full, | |
mod_mrf_reduced = mod_mrf_reduced, | |
mod_complete = mod_complete | |
) | |
tibble( | |
model = names(models), | |
summary = map(models, summary), | |
n_coef = map_int(models, \(x) length(coef(x))), | |
r_sq = map_dbl(summary, "r.sq"), | |
dev_expl = map_dbl(summary, "dev.expl"), | |
sigma = map_dbl(models, "sig2") | |
) |> | |
bind_cols(map_dfr(models, \(x) {out <- as.data.frame(t(summary(x)$edf[1:4])); colnames(out) <- paste0("edf", 1:4); out})) | |
# How do predictions and performance compare? | |
par(mfrow = c(1,1)) | |
plot(predict(mod_re), predict(mod_mrf_reduced)) | |
cor(predict(mod_re), predict(mod_mrf_reduced)) | |
# But what if we only look at complete data? | |
cc <- complete.cases(dat) | |
plot(predict(mod_re)[cc], predict(mod_mrf_reduced)[cc]) | |
cor(predict(mod_re)[cc], predict(mod_mrf_reduced)[cc]) | |
# How about individual terms? | |
par(mfrow = c(2,2)) | |
plot( | |
predict(mod_re, type = "terms", terms = "s(x0):ordered(!mx0)TRUE")[cc,1], | |
predict(mod_mrf_reduced, type = "terms", terms = "s(x0):ordered(!mx0)TRUE")[cc,1] | |
) | |
plot( | |
predict(mod_re, type = "terms", terms = "s(x1):ordered(!mx1)TRUE")[cc,1], | |
predict(mod_mrf_reduced, type = "terms", terms = "s(x1):ordered(!mx1)TRUE")[cc,1] | |
) | |
plot( | |
predict(mod_re, type = "terms", terms = "s(x2):ordered(!mx2)TRUE")[cc,1], | |
predict(mod_mrf_reduced, type = "terms", terms = "s(x2):ordered(!mx2)TRUE")[cc,1] | |
) | |
plot( | |
predict(mod_re, type = "terms", terms = "s(x3):ordered(!mx3)TRUE")[cc,1], | |
predict(mod_mrf_reduced, type = "terms", terms = "s(x3):ordered(!mx3)TRUE")[cc,1] | |
) | |
# How do coefficients and variances compare? | |
par(mfrow = c(3,2)) | |
vc_re <- vcov(mod_re) | |
vc_mrf_reduced <- vcov(mod_mrf_reduced) | |
re_levels_0 <- stringi::stri_subset_fixed(rownames(vc_re), "s(idx0):") | |
mrf_levels_0 <- stringi::stri_subset_fixed(rownames(vc_mrf_reduced), "s(idx0):") | |
plot(coef(mod_re)[re_levels_0]) | |
plot(coef(mod_mrf_reduced)[mrf_levels_0]) | |
plot(sqrt(diag(vc_re[re_levels_0, re_levels_0]))) | |
plot(sqrt(diag(vc_mrf_reduced[mrf_levels_0, mrf_levels_0]))) | |
image(vc_re[re_levels_0, re_levels_0]) | |
image(vc_mrf_reduced[mrf_levels_0, mrf_levels_0]) | |
# How about looking at the variance component? | |
options(scipen = 10) | |
gam.vcomp(mod_re) | |
gam.vcomp(mod_mrf_full) | |
gam.vcomp(mod_mrf_reduced) | |
# Can we drop large penalty matrices from memory? | |
mod_mrf_reduced <- bam(y~s(x0,by=ordered(!mx0))+s(x1,by=ordered(!mx1))+ | |
s(x2,by=ordered(!mx2))+s(x3,by=ordered(!mx3))+ | |
s(idx0,bs="mrf", xt = list(penalty = p0), by=mx0, k = 5) + | |
s(idx1,bs="mrf", xt = list(penalty = p1), by=mx1, k = 5) + | |
s(idx2,bs="mrf", xt = list(penalty = p2), by=mx2, k = 5) + | |
s(idx3,bs="mrf", xt = list(penalty = p3), by=mx3, k = 5) | |
,data=dat1,discrete=TRUE, fit = FALSE) | |
for(i in 5:8) { | |
mod_mrf_reduced$smooth[[i]]$xt <- NULL | |
} | |
out <- bam(G=mod_mrf_reduced) | |
plot(out) | |
predict(out) | |
gam.check(out) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment