alexpghayes · May 8, 2025 17:02
diff --git a/ard-rdpg-sims-corr.R b/ard-rdpg-sims-corr.R
 library(fastRG)
 library(furrr)
 library(irlba)
 library(tidyverse)
 library(wordspace)

 plan(multisession, workers = 8)

 # http://arxiv.org/abs/1407.0900 -- this also works for subspaces of different dimensions
 subspace_loss <- function(u, v) {
  # see [1] Vu and Lei 2013 section 2.3
  # and [2] Rohe, Chatterjee, Yu 2011 Annals of Statistics page 1908

  u <- normalize.cols(u, tol = 1e-10)
  v <- normalize.cols(v, tol = 1e-10)

  s <- svd(crossprod(u, v))
  ncol(u) - sum(s$d^2)
 }

 # generate some data with correlation 0.5 to X, column-wise
 # NOTE: EMPIRICAL correlation, not population correlation!
 # https://stats.stackexchange.com/a/313138/99938
 complement <- function(y, corr = 0.5) {
  x <- rnorm(length(y))
  y.perp <- residuals(lm(x ~ y))
  corr * sd(y.perp) * y + y.perp * sd(y) * sqrt(1 - corr^2)
 }

 generate_traits <- function(U, corr, num_traits) {
  k <- ncol(U)
  traits <- matrix(0, nrow = nrow(U), ncol = num_traits)
  for (trait in 1:num_traits) {
    # repeat traits if num_traits > k
    trait_index <- trait %% k + 1
    u <- U[, trait_index, drop = TRUE]
    traits[, trait] <- complement(u, corr = corr)
  }
  traits
 }

 single_ard_sin_theta_loss <- function(n, corr, num_traits = 5, k = 5) {
  diagonal <- 0.5
  off_diagonal <- 0.05

  B <- matrix(off_diagonal, nrow = k, ncol = k)
  diag(B) <- diagonal

  pi <- rep(1, k) / k

  theta <- rexp(n) + 1

  A_model <- dcsbm(
    theta = theta, # heterogeneous degrees
    B = B,
    pi = pi,
    allow_self_loops = FALSE,
    poisson_edges = FALSE,
    expected_degree = 2 * n^0.6
  )

  s_pop <- svds(A_model)

  traits <- generate_traits(s_pop$u, corr = corr, num_traits = num_traits)

  A <- sample_sparse(A_model)
  Y <- A %*% traits

  s_ard <- svd(Y, k, k)
  subspace_loss(s_pop$u, s_ard$u)
 }


 logspace <- function(min, max, num, base = 10) {
  round(
    base^seq(log(min, base = base), log(max, base = base), length.out = num)
  )
 }

 results <- expand_grid(
  n = logspace(100, 2000, 6),
  rep = 1:30,
  corr = c(0, 0.2, 0.4, 0.6, 0.8, 1),
  num_traits = c(1, 3, 5, 7, 9, 11)
 ) |>
  mutate(
    loss = future_pmap_dbl(
      list(n, corr, num_traits),
      single_ard_sin_theta_loss,
      .options = furrr_options(seed = TRUE)
    )
  )

 ggplot(results) +
  aes(
    x = n,
    y = loss,
    color = as.factor(corr)
  ) +
  geom_jitter(width = 0.1) +
  geom_smooth() +
  scale_x_log10(labels = scales::label_log(digits = 2)) +
  scale_y_log10(labels = scales::label_log(digits = 2)) +
  scale_color_viridis_d(begin = 0.15, end = 0.85) +
  labs(
    y = "Sin Theta Loss (log scale)",
    x = "Number of nodes (log scale)",
    color = "Correlation between\nlatent positions\nand traits (unobservable)",
    title = "Estimation error when using ARD to estimate latent positions in a SBM (5 blocks)"
  ) +
  facet_wrap(
    facets = vars(num_traits),
    labeller = labeller(
      num_traits = c(
        "1" = "Using 1 trait",
        "3" = "Using 3 traits",
        "5" = "Using 5 traits",
        "7" = "Using 7 traits",
        "9" = "Using 9 traits",
        "11" = "Using 11 traits"
      )
    )
  ) +
  theme_minimal(base_size = 16)
	library(fastRG)
	library(furrr)
	library(irlba)
	library(tidyverse)
	library(wordspace)

	plan(multisession, workers = 8)

	# http://arxiv.org/abs/1407.0900 -- this also works for subspaces of different dimensions
	subspace_loss <- function(u, v) {
	# see [1] Vu and Lei 2013 section 2.3
	# and [2] Rohe, Chatterjee, Yu 2011 Annals of Statistics page 1908

	u <- normalize.cols(u, tol = 1e-10)
	v <- normalize.cols(v, tol = 1e-10)

	s <- svd(crossprod(u, v))
	ncol(u) - sum(s$d^2)
	}

	# generate some data with correlation 0.5 to X, column-wise
	# NOTE: EMPIRICAL correlation, not population correlation!
	# https://stats.stackexchange.com/a/313138/99938
	complement <- function(y, corr = 0.5) {
	x <- rnorm(length(y))
	y.perp <- residuals(lm(x ~ y))
	corr * sd(y.perp) * y + y.perp * sd(y) * sqrt(1 - corr^2)
	}

	generate_traits <- function(U, corr, num_traits) {
	k <- ncol(U)
	traits <- matrix(0, nrow = nrow(U), ncol = num_traits)
	for (trait in 1:num_traits) {
	# repeat traits if num_traits > k
	trait_index <- trait %% k + 1
	u <- U[, trait_index, drop = TRUE]
	traits[, trait] <- complement(u, corr = corr)
	}
	traits
	}

	single_ard_sin_theta_loss <- function(n, corr, num_traits = 5, k = 5) {
	diagonal <- 0.5
	off_diagonal <- 0.05

	B <- matrix(off_diagonal, nrow = k, ncol = k)
	diag(B) <- diagonal

	pi <- rep(1, k) / k

	theta <- rexp(n) + 1

	A_model <- dcsbm(
	theta = theta, # heterogeneous degrees
	B = B,
	pi = pi,
	allow_self_loops = FALSE,
	poisson_edges = FALSE,
	expected_degree = 2 * n^0.6
	)

	s_pop <- svds(A_model)

	traits <- generate_traits(s_pop$u, corr = corr, num_traits = num_traits)

	A <- sample_sparse(A_model)
	Y <- A %*% traits

	s_ard <- svd(Y, k, k)
	subspace_loss(s_pop$u, s_ard$u)
	}


	logspace <- function(min, max, num, base = 10) {
	round(
	base^seq(log(min, base = base), log(max, base = base), length.out = num)
	)
	}

	results <- expand_grid(
	n = logspace(100, 2000, 6),
	rep = 1:30,
	corr = c(0, 0.2, 0.4, 0.6, 0.8, 1),
	num_traits = c(1, 3, 5, 7, 9, 11)
	) \|>
	mutate(
	loss = future_pmap_dbl(
	list(n, corr, num_traits),
	single_ard_sin_theta_loss,
	.options = furrr_options(seed = TRUE)
	)
	)

	ggplot(results) +
	aes(
	x = n,
	y = loss,
	color = as.factor(corr)
	) +
	geom_jitter(width = 0.1) +
	geom_smooth() +
	scale_x_log10(labels = scales::label_log(digits = 2)) +
	scale_y_log10(labels = scales::label_log(digits = 2)) +
	scale_color_viridis_d(begin = 0.15, end = 0.85) +
	labs(
	y = "Sin Theta Loss (log scale)",
	x = "Number of nodes (log scale)",
	color = "Correlation between\nlatent positions\nand traits (unobservable)",
	title = "Estimation error when using ARD to estimate latent positions in a SBM (5 blocks)"
	) +
	facet_wrap(
	facets = vars(num_traits),
	labeller = labeller(
	num_traits = c(
	"1" = "Using 1 trait",
	"3" = "Using 3 traits",
	"5" = "Using 5 traits",
	"7" = "Using 7 traits",
	"9" = "Using 9 traits",
	"11" = "Using 11 traits"
	)
	)
	) +
	theme_minimal(base_size = 16)