Created
May 8, 2025 17:02
-
-
Save alexpghayes/f0041c74c68463e31be54818ee5160f5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(fastRG) | |
library(furrr) | |
library(irlba) | |
library(tidyverse) | |
library(wordspace) | |
plan(multisession, workers = 8) | |
# http://arxiv.org/abs/1407.0900 -- this also works for subspaces of different dimensions | |
subspace_loss <- function(u, v) { | |
# see [1] Vu and Lei 2013 section 2.3 | |
# and [2] Rohe, Chatterjee, Yu 2011 Annals of Statistics page 1908 | |
u <- normalize.cols(u, tol = 1e-10) | |
v <- normalize.cols(v, tol = 1e-10) | |
s <- svd(crossprod(u, v)) | |
ncol(u) - sum(s$d^2) | |
} | |
# generate some data with correlation 0.5 to X, column-wise | |
# NOTE: EMPIRICAL correlation, not population correlation! | |
# https://stats.stackexchange.com/a/313138/99938 | |
complement <- function(y, corr = 0.5) { | |
x <- rnorm(length(y)) | |
y.perp <- residuals(lm(x ~ y)) | |
corr * sd(y.perp) * y + y.perp * sd(y) * sqrt(1 - corr^2) | |
} | |
generate_traits <- function(U, corr, num_traits) { | |
k <- ncol(U) | |
traits <- matrix(0, nrow = nrow(U), ncol = num_traits) | |
for (trait in 1:num_traits) { | |
# repeat traits if num_traits > k | |
trait_index <- trait %% k + 1 | |
u <- U[, trait_index, drop = TRUE] | |
traits[, trait] <- complement(u, corr = corr) | |
} | |
traits | |
} | |
single_ard_sin_theta_loss <- function(n, corr, num_traits = 5, k = 5) { | |
diagonal <- 0.5 | |
off_diagonal <- 0.05 | |
B <- matrix(off_diagonal, nrow = k, ncol = k) | |
diag(B) <- diagonal | |
pi <- rep(1, k) / k | |
theta <- rexp(n) + 1 | |
A_model <- dcsbm( | |
theta = theta, # heterogeneous degrees | |
B = B, | |
pi = pi, | |
allow_self_loops = FALSE, | |
poisson_edges = FALSE, | |
expected_degree = 2 * n^0.6 | |
) | |
s_pop <- svds(A_model) | |
traits <- generate_traits(s_pop$u, corr = corr, num_traits = num_traits) | |
A <- sample_sparse(A_model) | |
Y <- A %*% traits | |
s_ard <- svd(Y, k, k) | |
subspace_loss(s_pop$u, s_ard$u) | |
} | |
logspace <- function(min, max, num, base = 10) { | |
round( | |
base^seq(log(min, base = base), log(max, base = base), length.out = num) | |
) | |
} | |
results <- expand_grid( | |
n = logspace(100, 2000, 6), | |
rep = 1:30, | |
corr = c(0, 0.2, 0.4, 0.6, 0.8, 1), | |
num_traits = c(1, 3, 5, 7, 9, 11) | |
) |> | |
mutate( | |
loss = future_pmap_dbl( | |
list(n, corr, num_traits), | |
single_ard_sin_theta_loss, | |
.options = furrr_options(seed = TRUE) | |
) | |
) | |
ggplot(results) + | |
aes( | |
x = n, | |
y = loss, | |
color = as.factor(corr) | |
) + | |
geom_jitter(width = 0.1) + | |
geom_smooth() + | |
scale_x_log10(labels = scales::label_log(digits = 2)) + | |
scale_y_log10(labels = scales::label_log(digits = 2)) + | |
scale_color_viridis_d(begin = 0.15, end = 0.85) + | |
labs( | |
y = "Sin Theta Loss (log scale)", | |
x = "Number of nodes (log scale)", | |
color = "Correlation between\nlatent positions\nand traits (unobservable)", | |
title = "Estimation error when using ARD to estimate latent positions in a SBM (5 blocks)" | |
) + | |
facet_wrap( | |
facets = vars(num_traits), | |
labeller = labeller( | |
num_traits = c( | |
"1" = "Using 1 trait", | |
"3" = "Using 3 traits", | |
"5" = "Using 5 traits", | |
"7" = "Using 7 traits", | |
"9" = "Using 9 traits", | |
"11" = "Using 11 traits" | |
) | |
) | |
) + | |
theme_minimal(base_size = 16) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment