Skip to content

Instantly share code, notes, and snippets.

@tomsing1
Created June 3, 2022 00:50
Show Gist options
  • Select an option

  • Save tomsing1/ad769fa0394bf9ce04b4f786528762a5 to your computer and use it in GitHub Desktop.

Select an option

Save tomsing1/ad769fa0394bf9ce04b4f786528762a5 to your computer and use it in GitHub Desktop.
Querying NCBI GEO and Biosample databases from R using the rentrez package
library(glue)
library(purrr)
library(rentrez)
library(snakecase)
library(tidyr)
library(xml2)
rentrez::entrez_dbs() # for reference: available databases
# query NCBI GEO for information about Series GSE178265
kStudy <- "GSE178265"
terms <- entrez_db_searchable(db = "gds") # for reference: searchable fields
# first, get additional information about this series, including its db ID.
series <- entrez_search("gds",
term = glue("{kStudy}[ACCN] AND gse[ETYP]"))
# use the ID to fetch details, including the sample table
series_summary <- entrez_summary(db = "gds", id = series$ids)
# fetch sample annotations as XML records
terms <- entrez_db_searchable(db = "biosample") # for reference: searchable fields
sample_ids <- extract_from_esummary(series_summary, "samples")
sample_accessions <- entrez_search("biosample",
term = glue_collapse(
glue("{sample_ids$accession}"), sep = " OR "),
retmax = 1000L)
# query biosample database for details on each sample
records <- xml2::read_xml(
entrez_fetch("biosample", id = sample_accessions$ids, rettype = "xml")
)
# extract attributes for each sample
samples <- xml2::xml_find_all(records, "//BioSampleSet/BioSample")
df <- purrr::map_df(samples, function(x) {
data.frame(
accession = xml_attr(x, "accession"),
title = xml_text(xml2::xml_find_all(x, "Description/Title")),
biosample = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'BioSample']")),
geo = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'GEO']")),
organism = xml_attr(xml2::xml_find_all(x, "Description/Organism"),
"taxonomy_name"),
tags = purrr::map_chr(
xml_attrs(xml2::xml_find_all(x, "Attributes/Attribute")),
"attribute_name"),
values = xml2::xml_text(
xml2::xml_find_all(x, "Attributes/Attribute")
)
)
})
# pivot into a wide data.frame
df <- tidyr::pivot_wider(
df,
id_cols = c("accession", "biosample", "geo", "title", "organism"),
names_from = "tags",
values_from = "values")
colnames(df) <- snakecase::to_snake_case(colnames(df))
dim(df)
head(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment