tomsing1 · June 3, 2022 00:50
diff --git a/rentrez_geo_biosample.R b/rentrez_geo_biosample.R
 library(glue)
 library(purrr)
 library(rentrez)
 library(snakecase)
 library(tidyr)
 library(xml2)

 rentrez::entrez_dbs()  # for reference: available databases

 # query NCBI GEO for information about Series GSE178265
 kStudy <- "GSE178265"
 terms <- entrez_db_searchable(db = "gds")  # for reference: searchable fields
 # first, get additional information about this series, including its db ID.
 series <- entrez_search("gds",
                        term = glue("{kStudy}[ACCN] AND gse[ETYP]"))
 # use the ID to fetch details, including the sample table
 series_summary <- entrez_summary(db = "gds", id = series$ids)

 # fetch sample annotations as XML records
 terms <- entrez_db_searchable(db = "biosample")  # for reference: searchable fields
 sample_ids <- extract_from_esummary(series_summary, "samples")
 sample_accessions <- entrez_search("biosample",
                        term = glue_collapse(
                          glue("{sample_ids$accession}"), sep = " OR "),
                        retmax = 1000L)

 # query biosample database for details on each sample
 records <- xml2::read_xml(
  entrez_fetch("biosample", id = sample_accessions$ids, rettype = "xml")
 )
 # extract attributes for each sample
 samples <- xml2::xml_find_all(records, "//BioSampleSet/BioSample")
 df <- purrr::map_df(samples, function(x) {
  data.frame(
    accession = xml_attr(x, "accession"),
    title = xml_text(xml2::xml_find_all(x, "Description/Title")),
    biosample = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'BioSample']")),
    geo = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'GEO']")),
    organism = xml_attr(xml2::xml_find_all(x, "Description/Organism"),
      "taxonomy_name"),
    tags = purrr::map_chr(
      xml_attrs(xml2::xml_find_all(x, "Attributes/Attribute")),
      "attribute_name"),
    values = xml2::xml_text(
      xml2::xml_find_all(x, "Attributes/Attribute")
    )
  )
 })
 # pivot into a wide data.frame
 df <- tidyr::pivot_wider(
  df,
  id_cols = c("accession", "biosample", "geo", "title", "organism"),
  names_from = "tags",
  values_from = "values")
 colnames(df) <- snakecase::to_snake_case(colnames(df))
 dim(df)
 head(df)
	library(glue)
	library(purrr)
	library(rentrez)
	library(snakecase)
	library(tidyr)
	library(xml2)

	rentrez::entrez_dbs() # for reference: available databases

	# query NCBI GEO for information about Series GSE178265
	kStudy <- "GSE178265"
	terms <- entrez_db_searchable(db = "gds") # for reference: searchable fields
	# first, get additional information about this series, including its db ID.
	series <- entrez_search("gds",
	term = glue("{kStudy}[ACCN] AND gse[ETYP]"))
	# use the ID to fetch details, including the sample table
	series_summary <- entrez_summary(db = "gds", id = series$ids)

	# fetch sample annotations as XML records
	terms <- entrez_db_searchable(db = "biosample") # for reference: searchable fields
	sample_ids <- extract_from_esummary(series_summary, "samples")
	sample_accessions <- entrez_search("biosample",
	term = glue_collapse(
	glue("{sample_ids$accession}"), sep = " OR "),
	retmax = 1000L)

	# query biosample database for details on each sample
	records <- xml2::read_xml(
	entrez_fetch("biosample", id = sample_accessions$ids, rettype = "xml")
	)
	# extract attributes for each sample
	samples <- xml2::xml_find_all(records, "//BioSampleSet/BioSample")
	df <- purrr::map_df(samples, function(x) {
	data.frame(
	accession = xml_attr(x, "accession"),
	title = xml_text(xml2::xml_find_all(x, "Description/Title")),
	biosample = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'BioSample']")),
	geo = xml_text(xml2::xml_find_all(x, "Ids/Id[@db = 'GEO']")),
	organism = xml_attr(xml2::xml_find_all(x, "Description/Organism"),
	"taxonomy_name"),
	tags = purrr::map_chr(
	xml_attrs(xml2::xml_find_all(x, "Attributes/Attribute")),
	"attribute_name"),
	values = xml2::xml_text(
	xml2::xml_find_all(x, "Attributes/Attribute")
	)
	)
	})
	# pivot into a wide data.frame
	df <- tidyr::pivot_wider(
	df,
	id_cols = c("accession", "biosample", "geo", "title", "organism"),
	names_from = "tags",
	values_from = "values")
	colnames(df) <- snakecase::to_snake_case(colnames(df))
	dim(df)
	head(df)
No results found