library(fastadi)
library(igraph)
library(Matrix)
library(rcrossref)
library(tidygraph)
library(tidytext)
library(tidyverse)
library(vsp)
# pull the Ji and Jin (2016) data. see:
#
# 1. Ji, Pengsheng, and Jiashun Jin. “Coauthorship and Citation Networks for Statisticians.” The Annals of Applied Statistics 10, no. 4 (2016): 1779–1812.
# 2. Wang, Song, and Karl Rohe. “Discussion of ‘Coauthorship and Citation Networks for Statisticians.’” The Annals of Applied Statistics 10, no. 4 (2016): 1820–26.
#
# for context. i'm going to loosely re-analyze the citation network in the vein of [2] here
source <- "https://www.stat.uga.edu/sites/default/files/psji/SCC2016-with-abs.zip"
tmp_zip <- tempfile()
tmp_fld <- tempdir()
download.file(source, tmp_zip)
unzip(tmp_zip, exdir = tmp_fld)
base_dir <- file.path(tmp_fld, "SCC2016-with-abs", "SCC2016", "Data")
citations <- read_table(
  file.path(base_dir, "paperCitAdj.txt"),
  col_names = FALSE
) |>
  as.matrix() |>
  Matrix()
papers <- read_csv(
  file.path(base_dir, "paperList.txt")
)
rownames(citations) <- papers$DOI
colnames(citations) <- papers$DOI
tbl_graph <- citations |>
  graph_from_adjacency_matrix() |>
  as_tbl_graph() |>
  left_join(papers, by = c("name" = "DOI")) |>
  select(-citCounts)
# now we augment the original Ji and Jin (2016) data with additional information
# about each paper, most importantly the date each paper was published
dois <- tbl_graph |>
  as_tibble() |>
  pull(name)
# set your email here to get access to faster API calls
# Sys.setenv(crossref_email = "[email protected]")
# takes a while, but at least it has a progress bar
#
# NOTE: this does not succeed for every paper, so some information will be NA. this is fine
#
paper_details <- cr_works(dois, .progress = "text")
paper_crossref_df <- paper_details$data |>
  select(-title, -abstract) # we already have these from the Ji and Jin data
citation_graph <- tbl_graph |>
  left_join(paper_crossref_df, by = c("name" = "doi")) |>
  mutate(
    published = ymd(created)
  ) |>
  select(-created, -deposited, -published.print, -issued, -published.online) |>
  arrange(published) # important: order nodes by publication date
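# quick check: how many papers are missing a parsed publication date?
# arrange() sorts NA values last, so papers with missing dates end up at the
# bottom of the ordering regardless of when they were actually published
citation_graph |>
  as_tibble() |>
  summarise(missing_date = sum(is.na(published)))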
# A[i, j] is the directed edge from node i to node j
A <- citation_graph |>
  as_adj()
# sanity check: most citations should be in the upper triangle. close enough,
# especially given that some date information is missing
sum(triu(A)) # 5152 in the upper triangle
sum(tril(A, -1)) # 570 in the strict lower triangle
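# as a fraction: 5152 / 5722, i.e. roughly 90% of citations respect the ordering
sum(triu(A)) / sum(A)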
# memory-efficient clipping helper that avoids creating explicit zeroes in A.
# drops all edges out of the last ell_z rows (the newest papers) and into the
# first ell_y columns (the oldest papers)
clip <- function(A, ell_z, ell_y) {
  n <- nrow(A)
  m <- as(A, "TsparseMatrix")
  # @i and @j are zero-indexed row/column indices of the stored entries
  ind <- m@i < (n - ell_z) & m@j >= ell_y
  m@x <- m@x[ind]
  m@i <- m@i[ind]
  m@j <- m@j[ind]
  as(m, "CsparseMatrix")
}
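# toy check of clip() on a small example matrix (values chosen arbitrarily):
# entries in the last ell_z rows and the first ell_y columns should disappear
toy <- Matrix(c(0, 1, 1, 1, 1,
                0, 0, 1, 1, 1,
                0, 0, 0, 1, 1,
                0, 0, 0, 0, 1,
                1, 0, 0, 0, 0), nrow = 5, byrow = TRUE, sparse = TRUE)
clip(toy, ell_z = 2, ell_y = 2) # rows 4-5 and columns 1-2 are now all zero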
# the clipping parameters here are set arbitrarily to roughly n / 10; it would
# be worth exploring other values (see the sketch below)
A_clipped <- clip(A, ell_z = 300, ell_y = 300)
sum(A) # 5722 edges
sum(A_clipped) # 5227 edges
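# sketch of a rough way to explore other clipping values: how many edges
# survive at a few candidate values of ell?
for (ell in c(100, 200, 300, 500)) {
  message("ell = ", ell, ": ", sum(clip(A, ell, ell)), " edges retained")
}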
# fastadi should report citations in both the upper and lower triangle, which
# hopefully confirms correctness here
mf <- citation_impute(
  A_clipped,
  rank = 15, # another parameter you need to determine. i picked this based on [2], which suggests it should be fine
  check_interval = 10,
  max_iter = 50L # fine for EDA; use more like ~250 iterations for final results if possible
)
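# eyeballing the estimated singular values can help sanity-check the rank
# choice -- a sharp drop-off near the end suggests rank = 15 is not truncating
# too aggressively
mf$d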
# predict citations forward in time. this is memory-inefficient, but it works
# because the graph is small. for larger networks this step takes a lot of time; see
#
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/R/analysis-forward-citation.R and
# https://github.com/alexpghayes/citation-cofactoring-replication/blob/main/src/estimate-degrees.cpp
#
# for some computational speedups
#
A_hat <- mf$u %*% diag(mf$d) %*% t(mf$v)
# predictions forward in time, should be zero for clipped rows/cols
tril(A_hat)
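# for larger graphs, a memory-friendlier alternative (a sketch, not part of the
# original workflow) is to score individual (i, j) pairs straight from the
# low-rank factors instead of forming the dense A_hat:
predict_pair <- function(mf, i, j) {
  # equivalent to A_hat[i, j] = sum_k u[i, k] * d[k] * v[j, k]
  sum(mf$u[i, ] * mf$d * mf$v[j, ])
}
predict_pair(mf, 1, 2) # matches A_hat[1, 2]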
# if you want to varimax-rotate the singular vector estimates
fa <- vsp(mf)
# vsp::get_y_hubs() and vsp::get_varimax_y() currently have bugs i need to fix,
# so set the factor names manually instead
colnames(fa$Z) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
rownames(fa$B) <- paste0("z", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$Y) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
colnames(fa$B) <- paste0("y", vsp:::left_padded_sequence(1:fa$rank))
# y hubs (i would expect these to be nice)
y_hubs <- citation_graph |>
  bind_varimax_y(fa) |>
  as_tibble() |>
  select(name, title, matches("y[0-9]+")) |>
  pivot_longer(
    matches("y[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(y_hubs)
# z hubs (less likely to be nice)
z_hubs <- citation_graph |>
  bind_varimax_z(fa) |>
  as_tibble() |>
  select(name, title, matches("z[0-9]+")) |>
  pivot_longer(
    matches("z[0-9]+"),
    names_to = "factor",
    values_to = "loading"
  ) |>
  slice_max(
    order_by = abs(loading),
    n = 10,
    with_ties = FALSE,
    by = factor
  )
View(z_hubs)
# the mixing matrix estimate B_hat looks mostly diagonal -- this is promising
# and makes me think the clipping parameters are roughly reasonable
plot_mixing_matrix(fa)
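# a rough numeric companion to the plot: the fraction of absolute mass on the
# diagonal of B_hat (closer to 1 = more diagonal)
B <- as.matrix(fa$B)
sum(abs(diag(B))) / sum(abs(B))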
# the abstracts are messy and need cleaning / good tokenization, which i am not
# going to worry about here. instead, just demonstrate bff() for finding keywords
words <- citation_graph |>
  as_tibble() |>
  select(name, title) |> # replace `title` with `abstract` to find keywords from abstracts instead
  unnest_tokens(word, title) |> # also here
  cast_sparse(name, word)
appearances <- colSums(words)
# only keep words that appear more than five times
title_words <- words[, appearances > 5]
y_keywords <- bff(fa$Y, title_words, num_best = 7)
z_keywords <- bff(fa$Z, title_words, num_best = 7)
# examine the keywords
y_keywords
z_keywords
# advice: play with ell_z, ell_y, and rank until the hubs and keywords make
# sense qualitatively and B looks mostly diagonal. then play with A_hat