Skip to content

Instantly share code, notes, and snippets.

@alexpghayes
Created May 8, 2025 17:02
Show Gist options
  • Save alexpghayes/f0041c74c68463e31be54818ee5160f5 to your computer and use it in GitHub Desktop.
Save alexpghayes/f0041c74c68463e31be54818ee5160f5 to your computer and use it in GitHub Desktop.
library(fastRG)
library(furrr)
library(irlba)
library(tidyverse)
library(wordspace)
plan(multisession, workers = 8)
# http://arxiv.org/abs/1407.0900 -- this also works for subspaces of different dimensions
subspace_loss <- function(u, v) {
# see [1] Vu and Lei 2013 section 2.3
# and [2] Rohe, Chatterjee, Yu 2011 Annals of Statistics page 1908
u <- normalize.cols(u, tol = 1e-10)
v <- normalize.cols(v, tol = 1e-10)
s <- svd(crossprod(u, v))
ncol(u) - sum(s$d^2)
}
# generate some data with correlation 0.5 to X, column-wise
# NOTE: EMPIRICAL correlation, not population correlation!
# https://stats.stackexchange.com/a/313138/99938
complement <- function(y, corr = 0.5) {
x <- rnorm(length(y))
y.perp <- residuals(lm(x ~ y))
corr * sd(y.perp) * y + y.perp * sd(y) * sqrt(1 - corr^2)
}
generate_traits <- function(U, corr, num_traits) {
k <- ncol(U)
traits <- matrix(0, nrow = nrow(U), ncol = num_traits)
for (trait in 1:num_traits) {
# repeat traits if num_traits > k
trait_index <- trait %% k + 1
u <- U[, trait_index, drop = TRUE]
traits[, trait] <- complement(u, corr = corr)
}
traits
}
single_ard_sin_theta_loss <- function(n, corr, num_traits = 5, k = 5) {
diagonal <- 0.5
off_diagonal <- 0.05
B <- matrix(off_diagonal, nrow = k, ncol = k)
diag(B) <- diagonal
pi <- rep(1, k) / k
theta <- rexp(n) + 1
A_model <- dcsbm(
theta = theta, # heterogeneous degrees
B = B,
pi = pi,
allow_self_loops = FALSE,
poisson_edges = FALSE,
expected_degree = 2 * n^0.6
)
s_pop <- svds(A_model)
traits <- generate_traits(s_pop$u, corr = corr, num_traits = num_traits)
A <- sample_sparse(A_model)
Y <- A %*% traits
s_ard <- svd(Y, k, k)
subspace_loss(s_pop$u, s_ard$u)
}
logspace <- function(min, max, num, base = 10) {
round(
base^seq(log(min, base = base), log(max, base = base), length.out = num)
)
}
results <- expand_grid(
n = logspace(100, 2000, 6),
rep = 1:30,
corr = c(0, 0.2, 0.4, 0.6, 0.8, 1),
num_traits = c(1, 3, 5, 7, 9, 11)
) |>
mutate(
loss = future_pmap_dbl(
list(n, corr, num_traits),
single_ard_sin_theta_loss,
.options = furrr_options(seed = TRUE)
)
)
ggplot(results) +
aes(
x = n,
y = loss,
color = as.factor(corr)
) +
geom_jitter(width = 0.1) +
geom_smooth() +
scale_x_log10(labels = scales::label_log(digits = 2)) +
scale_y_log10(labels = scales::label_log(digits = 2)) +
scale_color_viridis_d(begin = 0.15, end = 0.85) +
labs(
y = "Sin Theta Loss (log scale)",
x = "Number of nodes (log scale)",
color = "Correlation between\nlatent positions\nand traits (unobservable)",
title = "Estimation error when using ARD to estimate latent positions in a SBM (5 blocks)"
) +
facet_wrap(
facets = vars(num_traits),
labeller = labeller(
num_traits = c(
"1" = "Using 1 trait",
"3" = "Using 3 traits",
"5" = "Using 5 traits",
"7" = "Using 7 traits",
"9" = "Using 9 traits",
"11" = "Using 11 traits"
)
)
) +
theme_minimal(base_size = 16)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment