Created
February 12, 2021 11:32
-
-
Save njahn82/a1dbc0249f2e892a3c5f781e50de959a to your computer and use it in GitHub Desktop.
Open APC R helper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Get most current Open APC snapshot | |
#' | |
#' Open APC shares several csv files via GitHub, which contain data about | |
#' institutional spending for open access articles. | |
#' | |
#' | |
#' @param open_apc_cols character vector representing Open APC collections. | |
#' See `open_apc_collections()` for an overview. | |
#' | |
#' @importFrom dplyr filter `%>%` inner_join | |
#' @importFrom purrr map_df | |
#' | |
#' @return Tibble with article-level subset of Open APC dataset including | |
#' collection. | |
#' | |
#' @export | |
open_apc_get <- function(open_apc_cols = NULL) { | |
req <- open_apc_collections() %>% | |
filter(open_apc_collection %in% open_apc_cols) %>% | |
.$gh_link | |
out <- purrr::map_df(req, open_apc_get_) | |
inner_join(out, open_apc_collections(), by = "gh_link") | |
} | |
#' Get single Open APC dataset using GitHub Data API | |
#' | |
#' @param gh_link URL to GitHub blob | |
#' | |
#' @importFrom readr read_csv | |
#' @importFrom dplyr select mutate | |
#' @importFrom tidyselect any_of | |
#' | |
#' @export | |
open_apc_get_ <- function(gh_link = NULL) { | |
gh_file(gh_link, to_disk = FALSE) %>% | |
read_csv() %>% | |
select(any_of(c("institution", "doi", "period", "euro", "is_hybrid", "agreement"))) %>% | |
mutate(gh_link = gh_link) | |
} | |
#' Open APC collections | |
#' | |
#' @importFrom tibble tribble | |
#' @export | |
open_apc_collections <- function() { | |
tibble::tribble( | |
~open_apc_collection, ~gh_link, | |
"main", "https://github.com/OpenAPC/openapc-de/blob/master/data/apc_de.csv", | |
"co-funding", "https://github.com/OpenAPC/openapc-de/blob/master/data/apc_cofunding.csv", | |
"transformative agreements", "https://github.com/OpenAPC/openapc-de/blob/master/data/transformative_agreements/transformative_agreements.csv", | |
"deal-wiley-optout", "https://github.com/OpenAPC/openapc-de/blob/master/data/transformative_agreements/deal_wiley_germany_opt_out/deal_wiley_germany_opt_out.csv" | |
) | |
} | |
#' Gets a file from a github repo, using the Data API blob endpoint | |
#' | |
#' This avoids the 1MB limit of the content API and uses [gh::gh] to deal with | |
#' authorization and such. See https://developer.github.com/v3/git/blobs/ | |
#' | |
#' @author Noam Ross | |
#' | |
#' @param url the URL of the file to download via API, of the form | |
#' `github.com/:owner/:repo/blob/:path | |
#' @param ref the reference of a commit: a branch name, tag, or commit SHA | |
#' @param owner,repo,path,ref alternate way to specify the file. These will | |
#' override values in `url` | |
#' @param to_disk,destfile write file to disk (default=TRUE)? If so, use the | |
#' name in `destfile`, or the original filename by default | |
#' @param .token,.api_url,.method,.limit,.send_headers arguments passed on to | |
#' [gh::gh] | |
#' @importFrom gh gh | |
#' @importFrom stringi stri_match_all_regex | |
#' @importFrom purrr %||% keep | |
#' @importFrom base64enc base64decode | |
#' @return Either the local path of the downloaded file (default), or a raw | |
#' vector | |
gh_file <- function(url = NULL, ref=NULL, | |
owner = NULL, repo = NULL, path = NULL, | |
to_disk=TRUE, destfile=NULL, | |
.token = NULL, .api_url= NULL, .method="GET", | |
.limit = NULL, .send_headers = NULL) { | |
if (!is.null(url)) { | |
matches <- stri_match_all_regex( | |
url, | |
"(github\\.com/)?([^\\/]+)/([^\\/]+)/[^\\/]+/([^\\/]+)/([^\\?]+)" | |
) | |
owner <- owner %||% matches[[1]][3] | |
repo <- repo %||% matches[[1]][4] | |
ref <- ref %||% matches[[1]][5] | |
path <- path %||% matches[[1]][6] | |
pathdir <- dirname(path) | |
pathfile <- basename(path) | |
} | |
dir_contents <- gh( | |
"/repos/:owner/:repo/contents/:path", | |
owner = owner, repo = repo, path = pathdir, ref = ref, | |
.token = NULL, .api_url = NULL, .method = "GET", | |
.limit = NULL, .send_headers = NULL | |
) | |
file_sha <- keep(dir_contents, ~ .$path == path)[[1]]$sha | |
blob <- gh( | |
"/repos/:owner/:repo/git/blobs/:sha", | |
owner = owner, repo = repo, sha = file_sha, | |
.token = NULL, .api_url = NULL, .method = "GET", | |
.limit = NULL, .send_headers = NULL | |
) | |
raw <- base64decode(blob[["content"]]) | |
if (to_disk) { | |
destfile <- destfile %||% pathfile | |
writeBin(raw, con = destfile) | |
return(destfile) | |
} else { | |
return(raw) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment