Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created August 31, 2024 09:11
Show Gist options
  • Save stephenturner/e1487c90a98e6d5805a3211f0140e198 to your computer and use it in GitHub Desktop.
Save stephenturner/e1487c90a98e6d5805a3211f0140e198 to your computer and use it in GitHub Desktop.
R code to query bioRxiv API for publication details
library(tidyverse)
library(httr2)
# Pieces of the API path; together with the paging cursor added later
# they form <url>/<what>/<server>/<date1>/<date2>/<cursor>
url <- "https://api.biorxiv.org/"
what <- "details"
server <- "biorxiv"
date1 <- "2014-01-01"
date2 <- "2023-12-31"

# Base request shared by every page: all fixed path components are
# appended in a single call
basereq <- req_url_path_append(request(url), what, server, date1, date2)
# Iterate through 100 publications at a time until there are no more.
# Each successful page is stored in `responses`, keyed by its cursor.
#
# The cursor is deliberately an integer (0L, incremented by 100L). A double
# such as 1e5 is rendered as "1e+05" when it is pasted into the URL path or
# passed to as.character(), which would corrupt both the request URL and
# the list names once the offset reaches 100000.
cursor <- 0L
responses <- list()
repeat {
  # Set up the request for the current offset
  req <- req_url_path_append(basereq, cursor)

  # Message what request/cursor you're on
  message(cursor, " ", req$url)

  # Perform the request and parse the JSON body
  resp <-
    req |>
    req_perform() |>
    resp_body_json()

  # The page is usable only if the API reports status "ok" and the
  # collection is non-empty. pluck() returns NULL for a missing field and
  # identical() always yields a single TRUE/FALSE, so a response without a
  # status ends the loop cleanly instead of erroring inside `&&`.
  ok <-
    identical(pluck(resp, "messages", 1, "status"), "ok") &&
    length(resp$collection) > 0

  # Stop before storing anything from an empty or failed page
  if (!ok) break

  # Store the response under its cursor
  responses[[as.character(cursor)]] <- resp

  # Move on to the next page
  cursor <- cursor + 100L
}
# Build one tibble from all stored pages: each page's `collection` list is
# enframed (one row per element) and its `value` column is widened into
# one column per field; the per-page tibbles are then stacked row-wise
rdf <- bind_rows(
  lapply(responses, function(page) {
    unnest_wider(enframe(page$collection), col = value)
  })
)
# Tidy the combined table
rdf <-
  rdf |>
  # Drop columns that are not used further down
  select(!c(name, jatsxml, server)) |>
  # Parse the date and put a derived `year` column just before it
  mutate(date = as_date(date)) |>
  mutate(year = year(date), .before = date) |>
  # Strip surrounding whitespace from every character column
  mutate(across(where(is.character), function(x) trimws(x))) |>
  # Normalise category capitalisation
  mutate(category = str_to_title(category)) |>
  # Turn the literal string "NA" into a real missing value
  mutate(across(where(is.character), function(x) na_if(x, "NA"))) |>
  # Keep rows with a non-empty category (rows where category is NA are
  # dropped here too, because the comparison evaluates to NA)
  filter(category != "") |>
  # Remove exact duplicate rows and sort chronologically
  distinct() |>
  arrange(date)
# Keep one row per distinct value of `col`, choosing the row with the
# latest `date`, and return the result sorted by `date`.
#
# df:  data frame that has a `date` column.
# col: unquoted name of the column to de-duplicate on.
#
# Returns `df` with all columns kept and at most one row per value of `col`.
#
# NOTE: distinct() treats missing values as equal to each other, so all
# rows where `col` is NA collapse into a single row.
keep_latest_by <- function(df, col) {
  df |>
    # Latest date first within each key, so distinct() keeps that row
    arrange({{ col }}, desc(date)) |>
    distinct({{ col }}, .keep_all = TRUE) |>
    arrange(date)
}

# Dedupe by DOI, then by title, then by abstract, only keeping the latest
# version each time. After each pass, table(table(...)) tabulates how often
# each key occurs, as a check that every key now appears exactly once.
rdf <- keep_latest_by(rdf, doi)
table(table(rdf$doi))

rdf <- keep_latest_by(rdf, title)
table(table(rdf$title))

rdf <- keep_latest_by(rdf, abstract)
table(table(rdf$abstract))
# Replace line breaks in every character column with a visible " --- "
# marker. The pattern matches both the literal two-character sequence
# backslash + "n" and an actual newline character.
rdf <- mutate(
  rdf,
  across(
    where(is.character),
    function(txt) gsub("\\\\n|\\n", " --- ", txt)
  )
)
# Row counts per year and category, ordered by year and, within each
# year, from the most to the least frequent category
rdfsum <-
  rdf |>
  group_by(year, category) |>
  summarise(n = n(), .groups = "drop") |>
  arrange(year, desc(n))
# Preprints by year per subject area: one row per year, one column per
# category, with 0 where a year has no rows for a category.
#
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
# the category columns in sorted order, as spread() produced them.
# any_of() drops the "Clinical Trials" column when it exists and is a
# no-op otherwise, so the pipeline does not fail when that category is
# absent from the data.
rdfsum |>
  pivot_wider(
    names_from = category,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  ) |>
  select(-any_of("Clinical Trials")) |>
  arrange(year)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment