library(fastadi)
library(igraph)
library(Matrix)
library(rcrossref)
library(tidygraph)
library(tidytext) # needed for unnest_tokens() and cast_sparse() below
library(tidyverse)
library(vsp)
# pull Ji and Jin 2016 data. see:
#
# 1. Ji, Pengsheng, and Jiashun Jin. “Coauthorship and Citation Networks for Statisticians.” The Annals of Applied Statistics 10, no. 4 (2016): 1779–1812.
# 2. Wang, Song, and Karl Rohe. “Discussion of ‘Coauthorship and Citation Networks for Statisticians.’” The Annals of Applied Statistics 10, no. 4 (2016): 1820–26.
#
# for context. here i'm going to loosely re-analyze the data in the vein of [2]
source <- "https://www.stat.uga.edu/sites/default/files/psji/SCC2016-with-abs.zip"
tmp_zip <- tempfile()
tmp_fld <- tempdir()
download.file(source, tmp_zip)
unzip(tmp_zip, exdir = tmp_fld)
base_dir <- file.path(tmp_fld, "SCC2016-with-abs", "SCC2016", "Data")
citations <- read_table(
  file.path(base_dir, "paperCitAdj.txt"),
  col_names = FALSE
) |>
  as.matrix() |>
  Matrix()
papers <- read_csv(
  file.path(base_dir, "paperList.txt")
)
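# quick sanity check (my addition): the adjacency matrix should be square,
# with one row/column per paper in paperList.txt
stopifnot(nrow(citations) == ncol(citations), nrow(citations) == nrow(papers))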
rownames(citations) <- papers$DOI
colnames(citations) <- papers$DOI
tbl_graph <- citations |>
  graph_from_adjacency_matrix() |>
  as_tbl_graph() |>
  left_join(papers, by = c("name" = "DOI")) |>
  select(-citCounts)
# now we augment the original Ji and Jin 2016 data with additional information
# about each paper, most importantly the date each paper was published
dois <- tbl_graph |>
  as_tibble() |>
  pull(name)
# set your email here to get access to faster API calls
# Sys.setenv(crossref_email = "[email protected]")
# takes a while but at least it has a progress bar
#
# NOTE: does not succeed for every paper, some information will be NA, this is fine
#
paper_details <- cr_works(dois, .progress = "text")
paper_crossref_df <- paper_details$data |>
  select(-title, -abstract) # already have these from the Ji and Jin data
citation_graph <- tbl_graph |>
  left_join(paper_crossref_df, by = c("name" = "doi")) |>
  mutate(
    published = ymd(created)
  ) |>
  select(-created, -deposited, -published.print, -issued, -published.online) |>
  arrange(published) # important: order by publication date
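# optional check (my addition): how many papers have no recoverable
# publication date from crossref. a handful of NAs is expected and fine
citation_graph |>
  as_tibble() |>
  summarize(n_missing_date = sum(is.na(published)))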
# A[i, j] is the directed edge from node i to node j
A <- citation_graph |>
  as_adj()
# sanity check: most citations should land in the upper triangle. close enough
# here, especially given that some date information is missing
sum(triu(A)) # 5152 in upper
sum(tril(A, -1)) # 570 in lower
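# as a fraction (my addition): about 90% of citations are consistent with the ordering
sum(triu(A)) / sum(A)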
# memory-efficient clipping helper: drops entries in the last ell_z rows and
# the first ell_y columns without creating explicit zeroes in A
clip <- function(A, ell_z, ell_y) {
  n <- nrow(A)
  m <- as(A, "TsparseMatrix")
  # triplet indices are zero-based: keep entries above the last ell_z rows
  # and at or beyond column ell_y
  ind <- m@i < (n - ell_z) & m@j >= ell_y
  m@x <- m@x[ind]
  m@i <- m@i[ind]
  m@j <- m@j[ind]
  as(m, "CsparseMatrix")
}
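# a tiny check (my addition) that clip() does what's intended: on a 5x5
# matrix of ones, the last ell_z rows and first ell_y columns come back empty
toy <- Matrix(1, 5, 5, sparse = TRUE)
clip(toy, ell_z = 2, ell_y = 2) # nonzero only in rows 1-3, columns 3-5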
# clipping parameters here are set arbitrarily at roughly n / 10; worth
# exploring other values
A_clipped <- clip(A, ell_z = 300, ell_y = 300)
sum(A) # 5722 edges
sum(A_clipped) # 5227 edges
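# i.e. clipping discards under 10% of the observed citations (my addition)
sum(A_clipped) / sum(A) # ~0.91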
# fastadi should report citation counts in the upper and lower triangles,
# which hopefully matches the sanity check above and confirms correctness here
mf <- citation_impute(
  A_clipped,
  rank = 15, # another parameter you need to determine; i picked this based on [2] suggesting it should be fine
  check_interval = 10,
  max_iter = 50L # fine for EDA, use more like ~250 iterations for final results if possible
)
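# quick shape check (my addition): the factors u, d, v used below should
# reflect the requested rank
dim(mf$u)    # n x 15
length(mf$d) # 15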
# predict citations forward in time. this is memory inefficient but it works because
# the graph is small. for larger networks this step takes a lot of time and you can see
#
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/R/analysis-forward-citation.R and
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/src/estimate-degrees.cpp
#
# for some computational speed ups
#
A_hat <- mf$u %*% diag(mf$d) %*% t(mf$v)
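# if A_hat were too large to materialize, here is a minimal sketch (my
# addition) of scoring a single (i, j) entry straight from the factors:
# A_hat[i, j] = sum_k u[i, k] * d[k] * v[j, k]
predict_entry <- function(mf, i, j) {
  sum(mf$u[i, ] * mf$d * mf$v[j, ])
}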
# predictions forward in time, should be zero for clipped rows/cols
tril(A_hat)
# if you want to varimax rotate the singular vector estimates
fa <- vsp(mf)
# vsp::get_y_hubs() and vsp::get_varimax_y() currently have bugs i need to fix, so use this instead
colnames(fa$Z) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
rownames(fa$B) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$Y) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$B) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
# y hubs (i would expect this to be nice)
y_hubs <- citation_graph |>
  bind_varimax_y(fa) |>
  as_tibble() |>
  select(name, title, matches("y[0-9]+")) |>
  pivot_longer(
    matches("y[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(y_hubs)
# z_hubs (less likely to be nice)
z_hubs <- citation_graph |>
  bind_varimax_z(fa) |>
  as_tibble() |>
  select(name, title, matches("z[0-9]+")) |>
  pivot_longer(
    matches("z[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(z_hubs)
# mixing matrix estimate B_hat looks mostly diagonal -- this is promising and
# makes me think the clipping parameters are roughly reasonable
plot_mixing_matrix(fa)
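# a rough numeric companion to the plot (my addition): fraction of the mass
# of B_hat on the diagonal
sum(abs(diag(fa$B))) / sum(abs(fa$B))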
# the abstracts are messy and need cleaning / good tokenization, which i am not going to
# worry about here, just going to demonstrate bff() for finding keywords instead
words <- citation_graph |>
  as_tibble() |>
  select(name, title) |> # replace `title` with `abstract` to find keywords from abstracts instead
  unnest_tokens(word, title) |> # also here
  cast_sparse(name, word)
appearances <- colSums(words)
# only use words that appear more than five times
title_words <- words[, appearances > 5]
y_keywords <- bff(fa$Y, title_words, num_best = 7)
z_keywords <- bff(fa$Z, title_words, num_best = 7)
# examine keywords
y_keywords
z_keywords
# advice: play with ell_z and ell_y and rank until the hubs and keywords make sense qualitatively
# and B looks mostly diagonal. then play with A_hat