Skip to content

Instantly share code, notes, and snippets.

@tomsing1
Last active March 10, 2022 22:56
Show Gist options
  • Save tomsing1/069bf71e7f2893174b49654db839e0cf to your computer and use it in GitHub Desktop.
Save tomsing1/069bf71e7f2893174b49654db839e0cf to your computer and use it in GitHub Desktop.
Accessing ENA's REST APIs from R
library(checkmate)
library(dplyr)
library(glue)
library(htmltidy)
library(httr)
library(purrr)
library(xml2)
# https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html
identify_accession_type <- function(accessions) {
checkmate::assert_character(accessions)
purrr::map_chr(
accessions, function(accession) {
accession_type <- dplyr::case_when(
grepl("^(E|D|S)RP[0-9]{6,}$", accession) ~ "secondary_study_accession",
grepl("^PRJ(E|D|N)[A-Z][0-9]+$", accession) ~ "study_accession",
grepl("^SAM(E|D|N)[A-Z]?[0-9]+$", accession) ~ "sample_accession",
grepl("^(E|D|S)RS[0-9]{6,}$", accession) ~ "secondary_sample_accession",
grepl("^(E|D|S)RX[0-9]{6,}$", accession) ~ "experiment_accession",
grepl("^(E|D|S)RR[0-9]{6,}$", accession) ~ "run_accession",
grepl("^GSM[0-9]+$", accession) ~ "sample_alias",
grepl("^GSE[0-9]+$", accession) ~ "study_alias",
TRUE ~ NA_character_
)
if (is.na(accession_type)) {
stop(glue::glue("Sorry, {accession} does not look like a recognized ",
"identifier type."))
}
return(accession_type)
})
}
get_accessions <- function(
accessions, format = c("tsv", "json"),
fields = c("study_accession", "sample_accession", "experiment_accession",
"submission_accession")) {
format <- match.arg(format)
accession_type <- identify_accession_type(accessions)
r <- httr::GET(url = "https://www.ebi.ac.uk/ena/portal/api/search",
query = list(
result = "read_run",
query = glue::glue_collapse(
glue::glue("{accession_type}=\"{accessions}\""),
sep = " OR "),
fields = glue::glue_collapse(fields, sep = ","),
format = format
)
)
httr::stop_for_status(r)
if (format == "tsv") {
httr::content(r, as = "parsed", type = "text/tab-separated-values",
encoding = "UTF-8", show_col_types = FALSE)
} else {
httr::content(r, as = "parsed", type = "application/json",
encoding = "UTF-8")
}
}
get_records <- function(accessions) {
r <- httr::GET(
url = glue::glue(
"https://www.ebi.ac.uk/ena/browser/api/xml/",
"{glue::glue_collapse(accessions, sep = ',')}"),
)
httr::stop_for_status(r)
httr::content(r, as = "parsed", type = "text/xml",
encoding = "UTF-8")
}
#' Get a file report from the ENA Portal API
#'
#' File reports are available for the following accession types:
#' - Study accessions (ERP, SRP, DRP, PRJ prefixes)
#' - Experiment accessions (ERX, SRX, DRX prefixes)
#' - Sample accessions (ERS, SRS, DRS, SAM prefixes)
#' - Run accessions (ERR, SRR, DRR prefixes)
get_filereport <- function(
accession,
fields = c("accession", "center_name", "description", "experiment_accession",
"experiment_alias", "fastq_aspera", "fastq_ftp",
"instrument_model", "library_layout", "library_selection",
"library_source", "library_strategy","parent_study","read_count",
"sample_accession","sample_alias","sample_description",
"sample_title", "secondary_sample_accession",
"secondary_study_accession", "scientific_name", "study_accession",
"study_alias", "study_title", "submission_accession", "tax_id",
"tissue_type"),
format = c("tsv", "json")) {
format = match.arg(format)
# verify that the accession is valid
checkmate::assert_string(accession)
accession_type <- identify_accession_type(accession)
if (!accession_type %in% c("study_accession", "secondary_study_accession",
"sample_accession", "secondary_sample_accession",
"experiment_accession", "run_accession")) {
stop(
glue_collapse(
c("Sorry, file reports are only available for:",
" - Study accessions (ERP, SRP, DRP, PRJ prefixes)",
" - Experiment accessions (ERX, SRX, DRX prefixes)",
" - Sample accessions (ERS, SRS, DRS, SAM prefixes)",
" - Run accessions (ERR, SRR, DRR prefixes)"), sep = "\n")
)
}
r <- httr::GET(
url = "https://www.ebi.ac.uk/ena/portal/api/filereport",
query = list(accession = accession,
fields = glue_collapse(fields, sep = ","),
format = format,
result = "read_run")
)
httr::stop_for_status(r)
httr::content(r, as = "parsed", type = "text/tab-separated-values",
encoding = "UTF-8", show_col_types = FALSE)
}
# examples
identify_accession_type(c("SRS6107963", "SAMN14003257"))
res1 <- get_accessions("PRJNA552470")
res2 <- get_accessions("SRP212869")
dplyr::all_equal(res1, res2)
get_accessions("SRS6107963")
get_accessions("SRS6107963", format = "json")
get_accessions(c("SAMN12207420", "SAMN12207419"))
get_accessions(c("SRR11028503", "SAMN12207419"))
get_accessions("SRX7680900")
get_accessions("SAMN14003257")
get_accessions("GSE133758")
get_accessions("GSM4296704")
get_records("SRR11028503")
get_records(c("SAMN12207420", "SAMN12207419"))
get_filereport("SRS6107963")
get_filereport("SRR11028503")
get_filereport("SAMN12207420")
get_filereport("PRJNA552470")
# get_filereport("GSM4296704") # this accession type is not supported
# looking up studies by project id
get_records("PRJNA552470") %>%
xml2::as_list() %>%
purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "PRIMARY_ID")
get_records("PRJNA552470") %>%
xml2::as_list() %>%
purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "SECONDARY_ID")
# looking up the same study by study id returns a different xml document
get_records("SRP212869") %>%
xml2::as_list() %>%
purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "PRIMARY_ID")
get_records("SRP212869") %>%
xml2::as_list() %>%
purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "SECONDARY_ID")
# inspect the structure of a record
record <- get_records("SRR11028503")
xml2::xml_structure(record)
# to expand the xml object into a list
xml2::as_list(record)
# to visualize the xml file in RStudio's viewer pane
htmltidy::xml_view(record)
htmltidy::xml_tree_view(record)
# extracting elements using xpath queries
records <- get_records(c("SAMN12207420", "SAMN12207419"))
xml_find_all(records, xpath = "//PRIMARY_ID")
xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
xml_attrs(xml_find_all(records, xpath = "//SAMPLE"))
xml_attr(xml_find_all(records, xpath = "//SAMPLE"), "accession")
# extract attributes as tag-value pairs from sample record(s)
records <- get_records(c("SAMN12207420", "SAMN12207419"))
samples <- xml_find_all(records, "//SAMPLE_SET/SAMPLE")
sample_attr <- map(samples, function(x) {
# we implicitely assume that tags and values always come in pairs
df <- data.frame(
tags = xml_text(
xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/TAG")),
values = xml_text(
xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE"))
)
df <- df[!grepl("^ENA-", df$tags), ]
})
names(sample_attr) <- xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
sample_attr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment