stephenturner · August 31, 2024 09:11
diff --git a/biorxiv-api-httr2.R b/biorxiv-api-httr2.R
 library(tidyverse)
 library(httr2)

 # Components of the API endpoint, in the order they appear in the path:
 # <url>/<what>/<server>/<date1>/<date2>
 url <- "https://api.biorxiv.org/"
 what <- "details"
 server <- "biorxiv"
 date1 <- "2014-01-01"
 date2 <- "2023-12-31"

 # Base request shared by every page; a cursor is appended to it per request
 basereq <- req_url_path_append(request(url), what, server, date1, date2)

 # Iterate through 100 publications at a time until there are no more.
 # The cursor (record offset) is appended as the last path component of each
 # request, and every successful page is stored in `responses` under the
 # cursor it was requested with.
 #
 # The cursor is kept as an integer on purpose: a double is rendered in
 # scientific notation at round values (as.character(100000) is "1e+05"),
 # which would end up in both the request URL and the list name.
 cursor <- 0L
 responses <- list()
 repeat {
   # Set up the request for the current offset
   req <-
     basereq |>
     req_url_path_append(cursor)
   # Message what request/cursor you're on
   message(cursor, " ", req$url)
   # Get the response and parse its JSON body into a list
   resp <-
     req |>
     req_perform() |>
     resp_body_json()
   # A page is usable only if it reports status "ok" and has items.
   # pluck() + identical() give FALSE instead of an error when the
   # messages/status fields are missing from the response.
   ok <-
     identical(pluck(resp, "messages", 1, "status"), "ok") &&
     length(resp$collection) > 0
   # Stop paging at the first unusable page
   if (!ok) break
   # Store the response, keyed by its cursor
   responses[[as.character(cursor)]] <- resp
   # Move to the next page
   cursor <- cursor + 100L
 }

 # Flatten each page's collection into rows, then stack all pages together
 rdf <- bind_rows(
   map(responses, function(page) {
     unnest_wider(enframe(page$collection), col = value)
   })
 )

 # Tidy the combined table: drop unused columns, parse the date and derive
 # the year, trim whitespace, title-case the category, turn literal "NA"
 # strings into missing values, drop empty categories and exact duplicates
 rdf <-
   rdf |>
   select(-c(name, jatsxml, server)) |>
   mutate(date = as_date(date)) |>
   mutate(year = year(date), .before = date) |>
   mutate(across(where(is.character), \(txt) trimws(txt))) |>
   mutate(category = stringr::str_to_title(category)) |>
   mutate(across(where(is.character), function(txt) na_if(txt, "NA"))) |>
   filter(category != "") |>
   distinct() |>
   arrange(date)

 # Keep one row per value of `key`: sort so the most recent date comes first
 # within each key, let distinct() keep that first row, then restore
 # chronological order.
 keep_latest <- function(df, key) {
   df |>
     arrange({{ key }}, desc(date)) |>
     distinct({{ key }}, .keep_all = TRUE) |>
     arrange(date)
 }

 # Dedupe by DOI, then tabulate how often each DOI still occurs
 rdf <- keep_latest(rdf, doi)
 table(table(rdf$doi))

 # Dedupe by title, then tabulate how often each title still occurs
 rdf <- keep_latest(rdf, title)
 table(table(rdf$title))

 # Dedupe by abstract, then tabulate how often each abstract still occurs
 rdf <- keep_latest(rdf, abstract)
 table(table(rdf$abstract))

 # Replace newlines -- real ones as well as literal backslash-n sequences --
 # with a visible " --- " separator in every character column
 rdf <- mutate(
   rdf,
   across(
     where(is.character),
     function(txt) gsub("\\\\n|\\n", " --- ", txt)
   )
 )


 # Summarize by year and category: one row per (year, category) with its
 # preprint count `n`, most frequent categories first within each year
 rdfsum <-
   rdf |>
   count(year, category) |>
   arrange(year, desc(n))

 # Preprints by year per subject area: one row per year, one column per
 # category, 0 where a category has no preprints in that year.
 # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
 # the category columns in sorted order as spread() did. any_of() makes
 # dropping "Clinical Trials" a no-op when that category is not present.
 rdfsum |>
   pivot_wider(
     names_from = category,
     values_from = n,
     values_fill = 0L,
     names_sort = TRUE
   ) |>
   select(-any_of("Clinical Trials")) |>
   arrange(year)
	library(tidyverse)
	library(httr2)

	# Set URL path variables
	url <- "https://api.biorxiv.org/"
	what <- "details"
	server <- "biorxiv"
	date1 <- "2014-01-01"
	date2 <- "2023-12-31"

	# Create base request URL: <url>/<what>/<server>/<date1>/<date2>
	basereq <-
	  request(url) |>
	  req_url_path_append(what) |>
	  req_url_path_append(server) |>
	  req_url_path_append(date1) |>
	  req_url_path_append(date2)

	# Iterate through 100 publications at a time until there are no more.
	# The cursor is an integer so that round offsets are not rendered in
	# scientific notation (as.character(100000) is "1e+05") in the request
	# URL or in the names of `responses`.
	cursor <- 0L
	responses <- list()
	repeat {
	  # Set up the request for the current offset
	  req <-
	    basereq |>
	    req_url_path_append(cursor)
	  # Message what request/cursor you're on
	  message(cursor, " ", req$url)
	  # Get the response and parse its JSON body into a list
	  resp <-
	    req |>
	    req_perform() |>
	    resp_body_json()
	  # A page is usable only if it reports status "ok" and has items.
	  # pluck() + identical() give FALSE instead of an error when the
	  # messages/status fields are missing from the response.
	  ok <-
	    identical(pluck(resp, "messages", 1, "status"), "ok") &&
	    length(resp$collection) > 0
	  # Break out of the loop if there are no more pubs at this offset
	  if (!ok) break
	  # Store the response, keyed by its cursor
	  responses[[as.character(cursor)]] <- resp
	  # Increment the cursor
	  cursor <- cursor + 100L
	}

	# Turn that list into a tibble
	rdf <-
	  responses |>
	  map(\(x) x$collection |> enframe() |> unnest_wider(col = value)) |>
	  bind_rows()

	# Clean
	rdf <-
	  rdf |>
	  select(-name, -jatsxml, -server) |>
	  mutate(date = as_date(date)) |>
	  mutate(year = year(date), .before = date) |>
	  mutate(across(where(is.character), trimws)) |>
	  mutate(category = stringr::str_to_title(category)) |>
	  mutate(across(where(is.character), \(x) na_if(x, "NA"))) |>
	  filter(category != "") |>
	  distinct() |>
	  arrange(date)

	# Dedupe by DOI, only keeping the row with the latest date
	rdf <-
	  rdf |>
	  arrange(doi, desc(date)) |>
	  distinct(doi, .keep_all = TRUE) |>
	  arrange(date)
	table(table(rdf$doi))

	# Dedupe by title, only keeping the row with the latest date
	rdf <-
	  rdf |>
	  arrange(title, desc(date)) |>
	  distinct(title, .keep_all = TRUE) |>
	  arrange(date)
	table(table(rdf$title))

	# Dedupe by abstract, only keeping the row with the latest date
	rdf <-
	  rdf |>
	  arrange(abstract, desc(date)) |>
	  distinct(abstract, .keep_all = TRUE) |>
	  arrange(date)
	table(table(rdf$abstract))

	# Clean up newlines (real ones and literal backslash-n sequences)
	rdf <-
	  rdf |>
	  mutate(across(where(is.character),
	                \(x) gsub(pattern = "\\\\n|\\n", replacement = " --- ", x)))


	# Summarize by year and category
	rdfsum <-
	  rdf |>
	  count(year, category) |>
	  arrange(year, desc(n))

	# Preprints by year per subject area: one row per year, one column per
	# category, 0 where a category has no preprints in that year.
	# pivot_wider() replaces the superseded spread(); any_of() makes dropping
	# "Clinical Trials" a no-op when that category is not present.
	rdfsum |>
	  pivot_wider(
	    names_from = category,
	    values_from = n,
	    values_fill = 0L,
	    names_sort = TRUE
	  ) |>
	  select(-any_of("Clinical Trials")) |>
	  arrange(year)