Last active
February 5, 2021 02:56
-
-
Save joelnitta/b40f820b0fe94658f81669b9904b4965 to your computer and use it in GitHub Desktop.
Go from genbank accession numbers to taxonomic data in the NCBI database
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Go from GenBank accession numbers to taxonomic data in the NCBI database.
#
# `accs` is a (long) vector of GenBank accession numbers; it must already
# exist before this script is run.
# The rentrez and XML packages are called via `::`, so nothing is attached.
stopifnot(length(accs) > 0)

# Only ca. 200 accessions can be searched at a time because of NCBI limits.
# head() is used instead of accs[1:200] so that a shorter `accs` does not
# pad the search term with NA entries.
max_accs <- 200
acc_batch <- head(accs, max_accs)

# Search the nucleotide database for the accessions in this batch.
# NOTE(review): the terms are joined with "|" as in the original script;
# confirm that NCBI interprets this as a boolean OR.
gb_acc <- rentrez::entrez_search(
  db = "nuccore",
  term = paste(acc_batch, collapse = "|"),
  use_history = TRUE,
  retmax = max_accs
)

# Link the nucleotide records to the taxonomy database.
# With the default `by_id = FALSE`, the links for all IDs are returned as a
# single pooled set, so the accession-to-taxon mapping is not kept here.
gb_tax_links <- rentrez::entrez_link(
  dbfrom = "nuccore",
  db = "taxonomy",
  id = gb_acc$ids
)

# Query the taxonomy database using the IDs in the links. Output is XML.
tax_results_xml <- rentrez::entrez_fetch(
  db = "taxonomy",
  id = gb_tax_links$links$nuccore_taxonomy,
  rettype = "xml",
  retmode = "text"
)

# Convert the XML to a list for (somewhat) easier downstream processing.
# Parsing explicitly as text avoids splitting the string into lines and
# removes the need for the magrittr pipe, which was never attached.
tax_results_list <- XML::xmlToList(
  XML::xmlParse(tax_results_xml, asText = TRUE)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment