joelnitta · February 5, 2021 02:56
diff --git a/gb_acc_to_taxonomy.R b/gb_acc_to_taxonomy.R
 # Go from GenBank accession numbers to taxonomic data from the NCBI database.
 #
 # Input:  `accs`, a (long) vector or list of GenBank accession numbers that
 #         must already be defined when this script runs.
 # Output: `tax_results_list`, the fetched taxonomy XML converted to a list.

 # Can only search max ca. 200 accessions at a time because of NCBI limits.
 # `head()` is used instead of `accs[1:200]` so that a shorter `accs` is not
 # padded with NA, which would add spurious "NA" terms to the query.
 gb_acc <- rentrez::entrez_search(
   db = "nuccore",
   term = paste(utils::head(accs, 200), collapse = "|"),
   use_history = TRUE,
   retmax = 200
 )

 # Link the matched nuccore records to the taxonomy database.
 # NOTE(review): with rentrez's default `by_id = FALSE` the links for all ids
 # are presumably pooled into a single set, so the accession-to-taxon mapping
 # is not kept here -- confirm if a per-accession mapping is needed.
 gb_tax_links <- rentrez::entrez_link(
   dbfrom = "nuccore",
   db = "taxonomy",
   id = gb_acc$ids
 )

 # Query the taxonomy database using the IDs in the links. Output is XML.
 tax_results_xml <- rentrez::entrez_fetch(
   db = "taxonomy",
   id = gb_tax_links$links$nuccore_taxonomy,
   rettype = "xml",
   retmode = "text"
 )

 # Split the XML text into lines, take the resulting character vector, and
 # convert it to a list for (somewhat) easier downstream processing.
 tax_results_list <-
   tax_results_xml %>%
   stringr::str_split("\\n") %>%
   magrittr::extract2(1) %>%
   XML::xmlToList()
	# Go from GenBank accession numbers to taxonomic data from the NCBI database.
	#
	# Input:  `accs`, a (long) vector or list of GenBank accession numbers that
	#         must already be defined when this script runs.
	# Output: `tax_results_list`, the fetched taxonomy XML converted to a list.

	# Can only search max ca. 200 accessions at a time because of NCBI limits.
	# The separator is a plain "|": "\|" is not a valid escape in an R string
	# literal and stops the script from parsing.
	# `head()` is used instead of `accs[1:200]` so that a shorter `accs` is not
	# padded with NA, which would add spurious "NA" terms to the query.
	gb_acc <- rentrez::entrez_search(
	  db = "nuccore",
	  term = paste(utils::head(accs, 200), collapse = "|"),
	  use_history = TRUE,
	  retmax = 200
	)

	# Link the matched nuccore records to the taxonomy database.
	# NOTE(review): with rentrez's default `by_id = FALSE` the links for all ids
	# are presumably pooled into a single set, so the accession-to-taxon mapping
	# is not kept here -- confirm if a per-accession mapping is needed.
	gb_tax_links <- rentrez::entrez_link(
	  dbfrom = "nuccore",
	  db = "taxonomy",
	  id = gb_acc$ids
	)

	# Query the taxonomy database using the IDs in the links. Output is XML.
	tax_results_xml <- rentrez::entrez_fetch(
	  db = "taxonomy",
	  id = gb_tax_links$links$nuccore_taxonomy,
	  rettype = "xml",
	  retmode = "text"
	)

	# Split the XML text into lines, take the resulting character vector, and
	# convert it to a list for (somewhat) easier downstream processing.
	tax_results_list <-
	  tax_results_xml %>%
	  stringr::str_split("\\n") %>%
	  magrittr::extract2(1) %>%
	  XML::xmlToList()