library(fastadi)
library(igraph)
library(Matrix)
library(rcrossref)
library(tidygraph)
library(tidytext) # needed for unnest_tokens() and cast_sparse() below
library(tidyverse)
library(vsp)
# pull Ji and Jin 2016 data. see:
#
# 1. Ji, Pengsheng, and Jiashun Jin. “Coauthorship and Citation Networks for Statisticians.” The Annals of Applied Statistics 10, no. 4 (2016): 1779–1812.
# 2. Wang, Song, and Karl Rohe. “Discussion of ‘Coauthorship and Citation Networks for Statisticians.’” The Annals of Applied Statistics 10, no. 4 (2016): 1820–26.
#
# for context. here i'm going to loosely re-analyze the data in the vein of [2]
source <- "https://www.stat.uga.edu/sites/default/files/psji/SCC2016-with-abs.zip"
tmp_zip <- tempfile()
tmp_fld <- tempdir()
download.file(source, tmp_zip)
unzip(tmp_zip, exdir = tmp_fld)
base_dir <- file.path(tmp_fld, "SCC2016-with-abs", "SCC2016", "Data")
citations <- read_table(
  file.path(base_dir, "paperCitAdj.txt"),
  col_names = FALSE
) |>
  as.matrix() |>
  Matrix()
papers <- read_csv(
  file.path(base_dir, "paperList.txt")
)
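# quick sanity check (my addition): the adjacency matrix should be square,
# with one row/column per paper in paperList.txt
stopifnot(nrow(citations) == ncol(citations), nrow(citations) == nrow(papers))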
rownames(citations) <- papers$DOI
colnames(citations) <- papers$DOI
tbl_graph <- citations |>
  graph_from_adjacency_matrix() |>
  as_tbl_graph() |>
  left_join(papers, by = c("name" = "DOI")) |>
  select(-citCounts)
# now we augment the original Ji and Jin 2016 data with additional information
# about each paper, most importantly the date each paper was published
dois <- tbl_graph |>
  as_tibble() |>
  pull(name)
# set your email here to get access to faster API calls
# Sys.setenv(crossref_email = "[email protected]")
# takes a while but at least it has a progress bar
#
# NOTE: does not succeed for every paper, some information will be NA, this is fine
#
paper_details <- cr_works(dois, .progress = "text")
paper_crossref_df <- paper_details$data |>
  select(-title, -abstract) # already have these from the Ji and Jin data
citation_graph <- tbl_graph |>
  left_join(paper_crossref_df, by = c("name" = "doi")) |>
  mutate(
    published = ymd(created)
  ) |>
  select(-created, -deposited, -published.print, -issued, -published.online) |>
  arrange(published) # important: order by publication date
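# optional check (my addition): how many papers have no recoverable
# publication date from crossref. a handful of NAs is expected and fine
citation_graph |>
  as_tibble() |>
  summarize(n_missing_date = sum(is.na(published)))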
# A[i, j] is the directed edge from node i to node j
A <- citation_graph |>
  as_adj()
# sanity check: most citations should land in the upper triangle. close enough
# here, especially given that some date information is missing
sum(triu(A)) # 5152 in upper
sum(tril(A, -1)) # 570 in lower
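# as a fraction (my addition): about 90% of citations are consistent with the ordering
sum(triu(A)) / sum(A)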
# memory-efficient clipping helper: drops entries in the last ell_z rows and
# the first ell_y columns without creating explicit zeroes in A
clip <- function(A, ell_z, ell_y) {
  n <- nrow(A)
  m <- as(A, "TsparseMatrix")
  # triplet indices are zero-based: keep entries above the last ell_z rows
  # and at or beyond column ell_y
  ind <- m@i < (n - ell_z) & m@j >= ell_y
  m@x <- m@x[ind]
  m@i <- m@i[ind]
  m@j <- m@j[ind]
  as(m, "CsparseMatrix")
}
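# a tiny check (my addition) that clip() does what's intended: on a 5x5
# matrix of ones, the last ell_z rows and first ell_y columns come back empty
toy <- Matrix(1, 5, 5, sparse = TRUE)
clip(toy, ell_z = 2, ell_y = 2) # nonzero only in rows 1-3, columns 3-5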
# clipping parameters here are set arbitrarily at roughly n / 10; worth
# exploring other values
A_clipped <- clip(A, ell_z = 300, ell_y = 300)
sum(A) # 5722 edges
sum(A_clipped) # 5227 edges
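# i.e. clipping discards under 10% of the observed citations (my addition)
sum(A_clipped) / sum(A) # ~0.91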
# fastadi should report citation counts in the upper and lower triangles,
# which hopefully matches the sanity check above and confirms correctness here
mf <- citation_impute(
  A_clipped,
  rank = 15, # another parameter you need to determine; i picked this based on [2] suggesting it should be fine
  check_interval = 10,
  max_iter = 50L # fine for EDA, use more like ~250 iterations for final results if possible
)
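# quick shape check (my addition): the factors u, d, v used below should
# reflect the requested rank
dim(mf$u)    # n x 15
length(mf$d) # 15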
# predict citations forward in time. this is memory inefficient but it works because
# the graph is small. for larger networks this step takes a lot of time and you can see
#
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/R/analysis-forward-citation.R and
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/src/estimate-degrees.cpp
#
# for some computational speed ups
#
A_hat <- mf$u %*% diag(mf$d) %*% t(mf$v)
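# if A_hat were too large to materialize, here is a minimal sketch (my
# addition) of scoring a single (i, j) entry straight from the factors:
# A_hat[i, j] = sum_k u[i, k] * d[k] * v[j, k]
predict_entry <- function(mf, i, j) {
  sum(mf$u[i, ] * mf$d * mf$v[j, ])
}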
# predictions forward in time, should be zero for clipped rows/cols
tril(A_hat)
# if you want to varimax rotate the singular vector estimates
fa <- vsp(mf)
# vsp::get_y_hubs() and vsp::get_varimax_y() currently have bugs i need to fix, so use this instead
colnames(fa$Z) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
rownames(fa$B) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$Y) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$B) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
# y hubs (i would expect this to be nice)
y_hubs <- citation_graph |>
  bind_varimax_y(fa) |>
  as_tibble() |>
  select(name, title, matches("y[0-9]+")) |>
  pivot_longer(
    matches("y[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(y_hubs)
# z_hubs (less likely to be nice)
z_hubs <- citation_graph |>
  bind_varimax_z(fa) |>
  as_tibble() |>
  select(name, title, matches("z[0-9]+")) |>
  pivot_longer(
    matches("z[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(z_hubs)
# mixing matrix estimate B_hat looks mostly diagonal -- this is promising and
# makes me think the clipping parameters are roughly reasonable
plot_mixing_matrix(fa)
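# a rough numeric companion to the plot (my addition): fraction of the mass
# of B_hat on the diagonal
sum(abs(diag(fa$B))) / sum(abs(fa$B))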
# the abstracts are messy and need cleaning / good tokenization, which i am not going to
# worry about here, just going to demonstrate bff() for finding keywords instead
words <- citation_graph |>
  as_tibble() |>
  select(name, title) |> # replace `title` with `abstract` to find keywords from abstracts instead
  unnest_tokens(word, title) |> # also here
  cast_sparse(name, word)
appearances <- colSums(words)
# only use words that appear more than five times
title_words <- words[, appearances > 5]
y_keywords <- bff(fa$Y, title_words, num_best = 7)
z_keywords <- bff(fa$Z, title_words, num_best = 7)
# examine keywords
y_keywords
z_keywords
# advice: play with ell_z and ell_y and rank until the hubs and keywords make sense qualitatively
# and B looks mostly diagonal. then play with A_hat