Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karthik/7e7875af0ecaa4327d3d61f550de94e0 to your computer and use it in GitHub Desktop.
Save karthik/7e7875af0ecaa4327d3d61f550de94e0 to your computer and use it in GitHub Desktop.
library('rcrossref')
library('tibble')
library('lubridate')
library('ggplot2')
library('dplyr')
library('readr')
library('tidyr')
# Build one row per calendar month: the first day of the month
# (`from_pub_date`) and the last day of that same month (`until_pub_date`).
# The window starts in January 2013 and covers `n_months` consecutive months.
n_months <- 63
from_pub_date <- seq(
  as.Date("2013-01-01"),
  length.out = n_months,
  by = "months"
)
# Last day of a month = first day of the following month minus one day.
until_pub_date <- seq(
  as.Date("2013-02-01"),
  length.out = n_months,
  by = "months"
) - 1
# Both bounds are stored as character strings (not Date objects).
months <- data.frame(
  from_pub_date = as.character(from_pub_date),
  until_pub_date = as.character(until_pub_date),
  stringsAsFactors = FALSE
)
# Turn the table into a list of one-row data frames (one element per month),
# named after the row names of `months`.
xy.list <- setNames(split(months, seq_len(nrow(months))), rownames(months))
xy.list
# Query Crossref for works matching the given filters and report the total
# number of matches.
#
# ...: named filter values, collected into the `filter` list of `cr_works()`.
# Returns the `total_results` entry of the response's `meta` element.
totals <- function(...) {
  crossref_filters <- list(...)
  response <- cr_works(filter = crossref_filters, .progress = "text")
  response$meta$total_results
}
# Collect the Crossref coverage counts for one publication-date window.
#
# from_pub_date, until_pub_date: bounds of the window; they are passed as the
#   `from_pub_date` / `until_pub_date` filters of every query.
# Returns a one-row tibble holding the total number of works (`tot`) and the
# number of works matching each of the `has_funder`, `has_license`,
# `has_orcid` and `has_abstract` filters.
get_data <- function(from_pub_date, until_pub_date) {
  # Every count shares the same date window; only the extra filter varies.
  count_works <- function(...) {
    totals(from_pub_date = from_pub_date, until_pub_date = until_pub_date, ...)
  }
  # tibble() replaces the deprecated dplyr::tbl_df(data.frame(...)).
  tibble::tibble(
    tot = count_works(),
    has_funder = count_works(has_funder = TRUE),
    has_license = count_works(has_license = TRUE),
    has_orcid = count_works(has_orcid = TRUE),
    has_abstract = count_works(has_abstract = TRUE)
  )
}
# -------------------------------------------
# Fetch the counts for every monthly window, stack the one-row results into a
# single table, and tag each row with the first day of its month.
results <- lapply(
  xy.list,
  function(period) get_data(period$from_pub_date, period$until_pub_date)
)
res <- bind_rows(results)
res[["month"]] <- as.Date(months[["from_pub_date"]])
# Express each coverage count as a percentage of that month's total works.
percentages <- res %>%
  mutate(
    percent_orcid = (has_orcid / tot) * 100,
    percent_license = (has_license / tot) * 100,
    percent_abstract = (has_abstract / tot) * 100,
    percent_funder = (has_funder / tot) * 100
  )
# For plotting, drop the raw counts and keep only the month and the percentage
# columns. Selecting by name (instead of by position 6:10) keeps this correct
# if columns are ever added to or reordered in `res`.
perc <- percentages %>%
  select(month, starts_with("percent_"))
# Plot caption crediting the data source, stamped with today's date.
caption <- paste0(
  "Crossref REST API & rOpenSci library. Retrieved ",
  lubridate::today()
)
# Reshape to long format: one row per (month, measure) pair, with the measure
# name in `col` and its percentage in `value`. Adding more measures then adds
# rows rather than columns, which is the shape ggplot2 expects.
# pivot_longer() is the current replacement for the superseded gather().
p2 <- tidyr::pivot_longer(
  perc,
  cols = -month,
  names_to = "col",
  values_to = "value"
)
# Build the figure and keep it in a variable so it can be written to disk.
# The subtitle is derived from the plotted months so it always matches the
# data (the previous hard-coded "2014 - present" did not).
year_span <- unique(format(range(p2$month), "%Y"))
crossref_fig <- ggplot(p2, aes(month, value, fill = col)) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  scale_y_continuous(limits = c(0, 100)) +
  # Dashed reference line at 100% coverage.
  geom_hline(yintercept = 100, linetype = "dashed") +
  labs(
    title = "Crossref Metadata Coverage (%)",
    subtitle = paste(year_span, collapse = " - "),
    caption = caption,
    x = "Publication date (month)",
    y = "% of total works published each month"
  ) +
  # Centre the title and subtitle above the plot.
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
# Give the legend a title, a fixed order and readable labels.
crossref_fig_updated <- crossref_fig +
  scale_fill_discrete(
    name = "Metadata",
    breaks = c(
      "percent_license", "percent_funder", "percent_orcid", "percent_abstract"
    ),
    labels = c("License", "Funding", "ORCID", "Abstract")
  )
# Display the plot.
crossref_fig_updated
# Write the full table (raw counts and percentages) to disk. The destination
# is passed positionally because readr deprecated the `path` argument name in
# favour of `file`; the positional form works with both.
write_csv(percentages, "~/Downloads/crossref_percentages.csv")
# Save the plot image. Arguments are named in full instead of relying on
# `file` partially matching `filename` and the plot falling through to `plot`.
ggsave(filename = "~/Downloads/crossref_fig.png", plot = crossref_fig_updated)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment