tomsing1 · March 10, 2022 22:56
diff --git a/ena_rest.R b/ena_rest.R
 library(checkmate)
 library(dplyr)
 library(glue)
 library(htmltidy)
 library(httr)
 library(purrr)
 library(xml2)

 # https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html
 identify_accession_type <- function(accessions) {
  checkmate::assert_character(accessions)
  purrr::map_chr(
    accessions, function(accession) {
      accession_type <- dplyr::case_when(
        grepl("^(E|D|S)RP[0-9]{6,}$", accession) ~ "secondary_study_accession",
        grepl("^PRJ(E|D|N)[A-Z][0-9]+$", accession) ~ "study_accession",
        grepl("^SAM(E|D|N)[A-Z]?[0-9]+$", accession) ~ "sample_accession",
        grepl("^(E|D|S)RS[0-9]{6,}$", accession) ~ "secondary_sample_accession",
        grepl("^(E|D|S)RX[0-9]{6,}$", accession) ~ "experiment_accession",
        grepl("^(E|D|S)RR[0-9]{6,}$", accession) ~ "run_accession",
        grepl("^GSM[0-9]+$", accession) ~ "sample_alias",
        grepl("^GSE[0-9]+$", accession) ~ "study_alias",
        TRUE ~ NA_character_
      )
      if (is.na(accession_type)) {
        stop(glue::glue("Sorry, {accession} does not look like a recognized ", 
                        "identifier type."))
      }
      return(accession_type)
    })
 }

 get_accessions <- function(
  accessions, format = c("tsv", "json"),
  fields = c("study_accession", "sample_accession", "experiment_accession", 
              "submission_accession")) {
  format <- match.arg(format)
  accession_type <- identify_accession_type(accessions)
  
  r <- httr::GET(url = "https://www.ebi.ac.uk/ena/portal/api/search",
                 query = list(
                   result = "read_run",
                   query = glue::glue_collapse(
                     glue::glue("{accession_type}=\"{accessions}\""),
                     sep = " OR "),
                   fields = glue::glue_collapse(fields, sep = ","),
                   format = format
                 )
  )
  httr::stop_for_status(r)
  if (format == "tsv") {
    httr::content(r, as = "parsed", type = "text/tab-separated-values", 
                  encoding = "UTF-8", show_col_types = FALSE)
  } else {
    httr::content(r, as = "parsed", type = "application/json", 
                  encoding = "UTF-8")
  }
 }

 get_records <- function(accessions) {
  r <- httr::GET(
    url = glue::glue(
      "https://www.ebi.ac.uk/ena/browser/api/xml/",
      "{glue::glue_collapse(accessions, sep = ',')}"), 
  )
  httr::stop_for_status(r)
  httr::content(r, as = "parsed", type = "text/xml", 
                encoding = "UTF-8")
  
 }

 #' Get a file report from the ENA Portal API
 #' 
 #' File reports are available for the following accession types:
 #' - Study accessions (ERP, SRP, DRP, PRJ prefixes)
 #' - Experiment accessions (ERX, SRX, DRX prefixes)
 #' - Sample accessions (ERS, SRS, DRS, SAM prefixes)
 #' - Run accessions (ERR, SRR, DRR prefixes)
 get_filereport <- function(
  accession,
  fields = c("accession", "center_name", "description", "experiment_accession",
             "experiment_alias", "fastq_aspera", "fastq_ftp",
             "instrument_model", "library_layout", "library_selection",
             "library_source", "library_strategy","parent_study","read_count",
             "sample_accession","sample_alias","sample_description",
             "sample_title", "secondary_sample_accession", 
             "secondary_study_accession", "scientific_name", "study_accession",
             "study_alias", "study_title", "submission_accession", "tax_id",
             "tissue_type"),
  format = c("tsv", "json")) {
  format = match.arg(format)
  
  # verify that the accession is valid
  checkmate::assert_string(accession)
  accession_type <- identify_accession_type(accession)
  if (!accession_type %in% c("study_accession", "secondary_study_accession",
                             "sample_accession", "secondary_sample_accession",
                             "experiment_accession", "run_accession")) {
    stop(
    glue_collapse(
      c("Sorry, file reports are only available for:",
        "  - Study accessions (ERP, SRP, DRP, PRJ prefixes)",
        "  - Experiment accessions (ERX, SRX, DRX prefixes)",
        "  - Sample accessions (ERS, SRS, DRS, SAM prefixes)",
        "  - Run accessions (ERR, SRR, DRR prefixes)"), sep = "\n")
    )
  }
  
    r <- httr::GET(
      url = "https://www.ebi.ac.uk/ena/portal/api/filereport",
      query = list(accession = accession,
                   fields = glue_collapse(fields, sep = ","),
                            format = format,
                   result = "read_run")
  )
  httr::stop_for_status(r)
  httr::content(r, as = "parsed", type = "text/tab-separated-values", 
                encoding = "UTF-8", show_col_types = FALSE)
 }
 # examples
 identify_accession_type(c("SRS6107963", "SAMN14003257"))

 res1 <- get_accessions("PRJNA552470")
 res2 <- get_accessions("SRP212869")
 dplyr::all_equal(res1, res2)

 get_accessions("SRS6107963")
 get_accessions("SRS6107963", format = "json")
 get_accessions(c("SAMN12207420", "SAMN12207419"))
 get_accessions(c("SRR11028503", "SAMN12207419"))
 get_accessions("SRX7680900")
 get_accessions("SAMN14003257")
 get_accessions("GSE133758")
 get_accessions("GSM4296704")

 get_records("SRR11028503")
 get_records(c("SAMN12207420", "SAMN12207419"))

 get_filereport("SRS6107963")
 get_filereport("SRR11028503")
 get_filereport("SAMN12207420")
 get_filereport("PRJNA552470")
 # get_filereport("GSM4296704")  # this accession type is not supported

 # looking up studies by project id
 get_records("PRJNA552470") %>%
  xml2::as_list() %>%
  purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "PRIMARY_ID")
 get_records("PRJNA552470") %>%
  xml2::as_list() %>%
  purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "SECONDARY_ID")

 # looking up the same study by study id returns a different xml document
 get_records("SRP212869") %>% 
  xml2::as_list() %>%
  purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "PRIMARY_ID")
 get_records("SRP212869") %>% 
  xml2::as_list() %>%
  purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "SECONDARY_ID")

 # inspect the structure of a record
 record <- get_records("SRR11028503")
 xml2::xml_structure(record)

 # to expand the xml object into a list
 xml2::as_list(record)

 # to visualize the xml file in RStudio's viewer pane
 htmltidy::xml_view(record)
 htmltidy::xml_tree_view(record)

 # extracting elements using xpath queries
 records <- get_records(c("SAMN12207420", "SAMN12207419"))
 xml_find_all(records, xpath = "//PRIMARY_ID")
 xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
 xml_attrs(xml_find_all(records, xpath = "//SAMPLE"))
 xml_attr(xml_find_all(records, xpath = "//SAMPLE"), "accession")

 # extract attributes as tag-value pairs from sample record(s)
 records <- get_records(c("SAMN12207420", "SAMN12207419"))
 samples <- xml_find_all(records, "//SAMPLE_SET/SAMPLE")
 sample_attr <- map(samples, function(x) {
  # we implicitely assume that tags and values always come in pairs
  df <- data.frame(
    tags = xml_text(
      xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/TAG")),
    values = xml_text(
      xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE"))
  )
  df <- df[!grepl("^ENA-", df$tags), ]
 })
 names(sample_attr) <- xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
 sample_attr
	library(checkmate)
	library(dplyr)
	library(glue)
	library(htmltidy)
	library(httr)
	library(purrr)
	library(xml2)

	# https://ena-docs.readthedocs.io/en/latest/submit/general-guide/accessions.html
	identify_accession_type <- function(accessions) {
	checkmate::assert_character(accessions)
	purrr::map_chr(
	accessions, function(accession) {
	accession_type <- dplyr::case_when(
	grepl("^(E\|D\|S)RP[0-9]{6,}$", accession) ~ "secondary_study_accession",
	grepl("^PRJ(E\|D\|N)[A-Z][0-9]+$", accession) ~ "study_accession",
	grepl("^SAM(E\|D\|N)[A-Z]?[0-9]+$", accession) ~ "sample_accession",
	grepl("^(E\|D\|S)RS[0-9]{6,}$", accession) ~ "secondary_sample_accession",
	grepl("^(E\|D\|S)RX[0-9]{6,}$", accession) ~ "experiment_accession",
	grepl("^(E\|D\|S)RR[0-9]{6,}$", accession) ~ "run_accession",
	grepl("^GSM[0-9]+$", accession) ~ "sample_alias",
	grepl("^GSE[0-9]+$", accession) ~ "study_alias",
	TRUE ~ NA_character_
	)
	if (is.na(accession_type)) {
	stop(glue::glue("Sorry, {accession} does not look like a recognized ",
	"identifier type."))
	}
	return(accession_type)
	})
	}

	get_accessions <- function(
	accessions, format = c("tsv", "json"),
	fields = c("study_accession", "sample_accession", "experiment_accession",
	"submission_accession")) {
	format <- match.arg(format)
	accession_type <- identify_accession_type(accessions)

	r <- httr::GET(url = "https://www.ebi.ac.uk/ena/portal/api/search",
	query = list(
	result = "read_run",
	query = glue::glue_collapse(
	glue::glue("{accession_type}=\"{accessions}\""),
	sep = " OR "),
	fields = glue::glue_collapse(fields, sep = ","),
	format = format
	)
	)
	httr::stop_for_status(r)
	if (format == "tsv") {
	httr::content(r, as = "parsed", type = "text/tab-separated-values",
	encoding = "UTF-8", show_col_types = FALSE)
	} else {
	httr::content(r, as = "parsed", type = "application/json",
	encoding = "UTF-8")
	}
	}

	get_records <- function(accessions) {
	r <- httr::GET(
	url = glue::glue(
	"https://www.ebi.ac.uk/ena/browser/api/xml/",
	"{glue::glue_collapse(accessions, sep = ',')}"),
	)
	httr::stop_for_status(r)
	httr::content(r, as = "parsed", type = "text/xml",
	encoding = "UTF-8")

	}

	#' Get a file report from the ENA Portal API
	#'
	#' File reports are available for the following accession types:
	#' - Study accessions (ERP, SRP, DRP, PRJ prefixes)
	#' - Experiment accessions (ERX, SRX, DRX prefixes)
	#' - Sample accessions (ERS, SRS, DRS, SAM prefixes)
	#' - Run accessions (ERR, SRR, DRR prefixes)
	get_filereport <- function(
	accession,
	fields = c("accession", "center_name", "description", "experiment_accession",
	"experiment_alias", "fastq_aspera", "fastq_ftp",
	"instrument_model", "library_layout", "library_selection",
	"library_source", "library_strategy","parent_study","read_count",
	"sample_accession","sample_alias","sample_description",
	"sample_title", "secondary_sample_accession",
	"secondary_study_accession", "scientific_name", "study_accession",
	"study_alias", "study_title", "submission_accession", "tax_id",
	"tissue_type"),
	format = c("tsv", "json")) {
	format = match.arg(format)

	# verify that the accession is valid
	checkmate::assert_string(accession)
	accession_type <- identify_accession_type(accession)
	if (!accession_type %in% c("study_accession", "secondary_study_accession",
	"sample_accession", "secondary_sample_accession",
	"experiment_accession", "run_accession")) {
	stop(
	glue_collapse(
	c("Sorry, file reports are only available for:",
	" - Study accessions (ERP, SRP, DRP, PRJ prefixes)",
	" - Experiment accessions (ERX, SRX, DRX prefixes)",
	" - Sample accessions (ERS, SRS, DRS, SAM prefixes)",
	" - Run accessions (ERR, SRR, DRR prefixes)"), sep = "\n")
	)
	}

	r <- httr::GET(
	url = "https://www.ebi.ac.uk/ena/portal/api/filereport",
	query = list(accession = accession,
	fields = glue_collapse(fields, sep = ","),
	format = format,
	result = "read_run")
	)
	httr::stop_for_status(r)
	httr::content(r, as = "parsed", type = "text/tab-separated-values",
	encoding = "UTF-8", show_col_types = FALSE)
	}
	# examples
	identify_accession_type(c("SRS6107963", "SAMN14003257"))

	res1 <- get_accessions("PRJNA552470")
	res2 <- get_accessions("SRP212869")
	dplyr::all_equal(res1, res2)

	get_accessions("SRS6107963")
	get_accessions("SRS6107963", format = "json")
	get_accessions(c("SAMN12207420", "SAMN12207419"))
	get_accessions(c("SRR11028503", "SAMN12207419"))
	get_accessions("SRX7680900")
	get_accessions("SAMN14003257")
	get_accessions("GSE133758")
	get_accessions("GSM4296704")

	get_records("SRR11028503")
	get_records(c("SAMN12207420", "SAMN12207419"))

	get_filereport("SRS6107963")
	get_filereport("SRR11028503")
	get_filereport("SAMN12207420")
	get_filereport("PRJNA552470")
	# get_filereport("GSM4296704") # this accession type is not supported

	# looking up studies by project id
	get_records("PRJNA552470") %>%
	xml2::as_list() %>%
	purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "PRIMARY_ID")
	get_records("PRJNA552470") %>%
	xml2::as_list() %>%
	purrr::pluck("PROJECT_SET", "PROJECT", "IDENTIFIERS", "SECONDARY_ID")

	# looking up the same study by study id returns a different xml document
	get_records("SRP212869") %>%
	xml2::as_list() %>%
	purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "PRIMARY_ID")
	get_records("SRP212869") %>%
	xml2::as_list() %>%
	purrr::pluck("STUDY_SET", "STUDY", "IDENTIFIERS", "SECONDARY_ID")

	# inspect the structure of a record
	record <- get_records("SRR11028503")
	xml2::xml_structure(record)

	# to expand the xml object into a list
	xml2::as_list(record)

	# to visualize the xml file in RStudio's viewer pane
	htmltidy::xml_view(record)
	htmltidy::xml_tree_view(record)

	# extracting elements using xpath queries
	records <- get_records(c("SAMN12207420", "SAMN12207419"))
	xml_find_all(records, xpath = "//PRIMARY_ID")
	xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
	xml_attrs(xml_find_all(records, xpath = "//SAMPLE"))
	xml_attr(xml_find_all(records, xpath = "//SAMPLE"), "accession")

	# extract attributes as tag-value pairs from sample record(s)
	records <- get_records(c("SAMN12207420", "SAMN12207419"))
	samples <- xml_find_all(records, "//SAMPLE_SET/SAMPLE")
	sample_attr <- map(samples, function(x) {
	# we implicitely assume that tags and values always come in pairs
	df <- data.frame(
	tags = xml_text(
	xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/TAG")),
	values = xml_text(
	xml_find_all(x, "SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE"))
	)
	df <- df[!grepl("^ENA-", df$tags), ]
	})
	names(sample_attr) <- xml_text(xml_find_all(records, xpath = "//PRIMARY_ID"))
	sample_attr