Created
August 31, 2024 09:11
-
-
Save stephenturner/e1487c90a98e6d5805a3211f0140e198 to your computer and use it in GitHub Desktop.
R code to query bioRxiv API for publication details
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(httr2)

# Pieces of the request path. They are appended to the API root in exactly
# this order; the paging cursor is added as one more path segment per request.
url <- "https://api.biorxiv.org/"
what <- "details"
server <- "biorxiv"
date1 <- "2014-01-01" # presumably the start of the date interval -- confirm against the API docs
date2 <- "2023-12-31" # presumably the end of the date interval -- confirm against the API docs

# Base request shared by every page:
#   <url>/<what>/<server>/<date1>/<date2>
basereq <- req_url_path_append(request(url), what, server, date1, date2)
# Page through the results, advancing the cursor by 100 after every page, and
# stop as soon as a page is not "ok" or comes back with an empty collection.
#
# The cursor MUST be an integer. A double cursor is rendered in scientific
# notation at round values (paste(100000) is "1e+05"), which would corrupt both
# the request URL and the names used to store the pages.
cursor <- 0L
responses <- list()
ok <- TRUE
while (ok) {
  # Request for the current page: the cursor is the last path segment
  req <- req_url_path_append(basereq, cursor)

  # Progress message: which cursor / URL is being fetched
  message(cursor, " ", req$url)

  # Perform the request and parse the JSON body into a nested list
  resp <-
    req |>
    req_perform() |>
    resp_body_json()

  # pluck() returns NULL (instead of erroring) when `messages` is empty or has
  # no `status`; identical() then yields a clean FALSE rather than a
  # zero-length condition inside `&&`.
  status <- pluck(resp, "messages", 1, "status")
  ok <- identical(status, "ok") && length(resp$collection) > 0

  if (ok) {
    # Keep the page, keyed by the cursor it was fetched with, and move on
    responses[[as.character(cursor)]] <- resp
    cursor <- cursor + 100L
  }
}
# Flatten each page's `collection` (a list of records) into one row per record
# and stack all pages into a single tibble
rdf <- bind_rows(
  map(responses, \(page) unnest_wider(enframe(page$collection), col = value))
)

# Clean up the combined table. The order of the character-column steps matters:
# whitespace is trimmed first, then `category` is title-cased, and only then
# are literal "NA" strings turned into real missing values.
rdf <- rdf |>
  # `name` is the positional index added by enframe()
  select(-c(name, jatsxml, server)) |>
  # Parse the date and put the derived `year` column right in front of it
  mutate(date = as_date(date), year = year(date), .before = date) |>
  mutate(across(where(is.character), trimws)) |>
  mutate(category = stringr::str_to_title(category)) |>
  mutate(across(where(is.character), \(x) na_if(x, "NA"))) |>
  # Drops rows with an empty category (rows with a missing category go too)
  filter(category != "") |>
  distinct() |>
  arrange(date)
# Keep only the most recent row (by `date`) for each distinct value of `key`.
#
# df:  a data frame with a `date` column.
# key: unquoted name of the column to de-duplicate on.
#
# Returns `df` sorted by `date` with at most one row per non-missing key value.
# Rows whose key is NA are all kept: a missing title/abstract is not evidence
# that two records are the same preprint, whereas distinct() would treat NA as
# a value and collapse every such row into one.
keep_latest_by <- function(df, key) {
  df |>
    # Latest version first within each key, so "first occurrence" == "latest"
    arrange({{ key }}, desc(.data$date)) |>
    filter(is.na({{ key }}) | !duplicated({{ key }})) |>
    arrange(.data$date)
}

# Dedupe by DOI, only keeping latest version
rdf <- keep_latest_by(rdf, doi)
# Sanity check: every remaining DOI should occur exactly once
table(table(rdf$doi))

# Dedupe by title, only keeping latest version
rdf <- keep_latest_by(rdf, title)
table(table(rdf$title))

# Dedupe by abstract, only keeping latest version
rdf <- keep_latest_by(rdf, abstract)
table(table(rdf$abstract))
# Replace line breaks in every character column with a visible " --- "
# separator. The pattern matches both a literal backslash followed by "n"
# and a real newline character.
rdf <- mutate(
  rdf,
  across(
    where(is.character),
    function(txt) gsub("\\\\n|\\n", " --- ", txt)
  )
)
# Number of preprints per year and category, largest categories first within
# each year
rdfsum <-
  rdf |>
  count(year, category) |>
  arrange(year, desc(n))

# Preprints by year per subject area: one row per year, one column per category.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps the
# alphabetical column order spread() produced, and year/category combinations
# with no preprints are filled with 0.
rdfsum |>
  pivot_wider(
    names_from = category,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  ) |>
  # any_of() so this does not error when the queried date range happens to
  # contain no "Clinical Trials" preprints
  select(-any_of("Clinical Trials")) |>
  arrange(year)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment