Last active
March 10, 2022 22:56
-
-
Save tomsing1/069bf71e7f2893174b49654db839e0cf to your computer and use it in GitHub Desktop.
Accessing ENA's REST APIs from R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(checkmate) | |
library(dplyr) | |
library(glue) | |
library(htmltidy) | |
library(httr) | |
library(purrr) | |
library(xml2) | |
# https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html | |
identify_accession_type <- function(accessions) { | |
checkmate::assert_character(accessions) | |
purrr::map_chr( | |
accessions, function(accession) { | |
accession_type <- dplyr::case_when( | |
grepl("^(E|D|S)RP[0-9]{6,}$", accession) ~ "secondary_study_accession", | |
grepl("^PRJ(E|D|N)[A-Z][0-9]+$", accession) ~ "study_accession", | |
grepl("^SAM(E|D|N)[A-Z]?[0-9]+$", accession) ~ "sample_accession", | |
grepl("^(E|D|S)RS[0-9]{6,}$", accession) ~ "secondary_sample_accession", | |
grepl("^(E|D|S)RX[0-9]{6,}$", accession) ~ "experiment_accession", | |
grepl("^(E|D|S)RR[0-9]{6,}$", accession) ~ "run_accession", | |
grepl("^GSM[0-9]+$", accession) ~ "sample_alias", | |
grepl("^GSE[0-9]+$", accession) ~ "study_alias", | |
TRUE ~ NA_character_ | |
) | |
if (is.na(accession_type)) { | |
stop(glue::glue("Sorry, {accession} does not look like a recognized ", | |
"identifier type.")) | |
} | |
return(accession_type) | |
}) | |
} | |
get_accessions <- function( | |
accessions, format = c("tsv", "json"), | |
fields = c("study_accession", "sample_accession", "experiment_accession", | |
"submission_accession")) { | |
format <- match.arg(format) | |
accession_type <- identify_accession_type(accessions) | |
r <- httr::GET(url = "https://www.ebi.ac.uk/ena/portal/api/search", | |
query = list( | |
result = "read_run", | |
query = glue::glue_collapse( | |
glue::glue("{accession_type}=\"{accessions}\""), | |
sep = " OR "), | |
fields = glue::glue_collapse(fields, sep = ","), | |
format = format | |
) | |
) | |
httr::stop_for_status(r) | |
if (format == "tsv") { | |
httr::content(r, as = "parsed", type = "text/tab-separated-values", | |
encoding = "UTF-8", show_col_types = FALSE) | |
} else { | |
httr::content(r, as = "parsed", type = "application/json", | |
encoding = "UTF-8") | |
} | |
} | |
get_records <- function(accessions) { | |
r <- httr::GET( | |
url = glue::glue( | |
"https://www.ebi.ac.uk/ena/browser/api/xml/", | |
"{glue::glue_collapse(accessions, sep = ',')}"), | |
) | |
httr::stop_for_status(r) | |
httr::content(r, as = "parsed", type = "text/xml", | |
encoding = "UTF-8") | |
} | |
#' Get a file report from the ENA Portal API | |
#' | |
#' File reports are available for the following accession types: | |
#' - Study accessions (ERP, SRP, DRP, PRJ prefixes) | |
#' - Experiment accessions (ERX, SRX, DRX prefixes) | |
#' - Sample accessions (ERS, SRS, DRS, SAM prefixes) | |
#' - Run accessions (ERR, SRR, DRR prefixes) | |
get_filereport <- function( | |
accession, | |
fields = c("accession", "center_name", "description", "experiment_accession", | |
"experiment_alias", "fastq_aspera", "fastq_ftp", | |
"instrument_model", "library_layout", "library_selection", | |
"library_source", "library_strategy","parent_study","read_count", | |
"sample_accession","sample_alias","sample_description", | |
"sample_title", "secondary_sample_accession", | |
"secondary_study_accession", "scientific_name", "study_accession", | |
"study_alias", "study_title", "submission_accession", "tax_id", | |
"tissue_type"), | |
format = c("tsv", "json")) { | |
format = match.arg(format) | |
# verify that the accession is valid | |
checkmate::assert_string(accession) | |
accession_type <- identify_accession_type(accession) | |
if (!accession_type %in% c("study_accession", "secondary_study_accession", | |
"sample_accession", "secondary_sample_accession", | |
"experiment_accession", "run_accession")) { | |
stop( | |
glue_collapse( | |
c("Sorry, file reports are only available for:", | |
" - Study accessions (ERP, SRP, DRP, PRJ prefixes)", | |
" - Experiment accessions (ERX, SRX, DRX prefixes)", | |
" - Sample accessions (ERS, SRS, DRS, SAM prefixes)", | |
" - Run accessions (ERR, SRR, DRR prefixes)"), sep = "\n") | |
) | |
} | |
r <- httr::GET( | |
url = "https://www.ebi.ac.uk/ena/portal/api/filereport", | |
query = list(accession = accession, | |
fields = glue_collapse(fields, sep = ","), | |
format = format, | |
result = "read_run") | |
) | |
httr::stop_for_status(r) | |
httr::content(r, as = "parsed", type = "text/tab-separated-values", | |
encoding = "UTF-8", show_col_types = FALSE) | |
} | |
# examples | |
identify_accession_type(c("SRS6107963", "SAMN14003257")) | |
res1 <- get_accessions("PRJNA552470") | |
res2 <- get_accessions("SRP212869") | |
dplyr::all_equal(res1, res2) | |
get_accessions("SRS6107963") | |
get_accessions("SRS6107963", format = "json") | |
get_accessions(c("SAMN12207420", "SAMN12207419")) | |
get_accessions(c("SRR11028503", "SAMN12207419")) | |
get_accessions("SRX7680900") | |
get_accessions("SAMN14003257") | |
get_accessions("GSE133758") | |
get_accessions("GSM4296704") | |
get_records("SRR11028503") | |
get_records(c("SAMN12207420", "SAMN12207419")) | |
get_filereport("SRS6107963") | |
get_filereport("SRR11028503") | |
get_filereport("SAMN12207420") | |
get_filereport("PRJNA552470") | |
# get_filereport("GSM4296704") # this accession type is not supported | |
# looking up studies by project id | |
get_records("PRJNA552470") %>% | |
xml2::as_list() %>% | |
purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "PRIMARY_ID") | |
get_records("PRJNA552470") %>% | |
xml2::as_list() %>% | |
purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "SECONDARY_ID") | |
# looking up the same study by study id returns a different xml document | |
get_records("SRP212869") %>% | |
xml2::as_list() %>% | |
purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "PRIMARY_ID") | |
get_records("SRP212869") %>% | |
xml2::as_list() %>% | |
purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "SECONDARY_ID") | |
# inspect the structure of a record | |
record <- get_records("SRR11028503") | |
xml2::xml_structure(record) | |
# to expand the xml object into a list | |
xml2::as_list(record) | |
# to visualize the xml file in RStudio's viewer pane | |
htmltidy::xml_view(record) | |
htmltidy::xml_tree_view(record) | |
# extracting elements using xpath queries | |
records <- get_records(c("SAMN12207420", "SAMN12207419")) | |
xml_find_all(records, xpath = "//PRIMARY_ID") | |
xml_text(xml_find_all(records, xpath = "//PRIMARY_ID")) | |
xml_attrs(xml_find_all(records, xpath = "//SAMPLE")) | |
xml_attr(xml_find_all(records, xpath = "//SAMPLE"), "accession") | |
# extract attributes as tag-value pairs from sample record(s) | |
records <- get_records(c("SAMN12207420", "SAMN12207419")) | |
samples <- xml_find_all(records, "//SAMPLE_SET/SAMPLE") | |
sample_attr <- map(samples, function(x) { | |
# we implicitely assume that tags and values always come in pairs | |
df <- data.frame( | |
tags = xml_text( | |
xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/TAG")), | |
values = xml_text( | |
xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE")) | |
) | |
df <- df[!grepl("^ENA-", df$tags), ] | |
}) | |
names(sample_attr) <- xml_text(xml_find_all(records, xpath = "//PRIMARY_ID")) | |
sample_attr |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment