Last active
January 15, 2021 17:49
-
-
Save padpadpadpad/cb426ce3d7e32cc16e0da4e75632af13 to your computer and use it in GitHub Desktop.
scrape the ncbi website to get the summary table for the genome assembly of an organism
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to grab assembly information from the NCBI website.
# I found the XPath of the table I wanted by opening an NCBI genome page in Google Chrome,
# then clicking View -> Developer -> Inspect Elements.
# After finding where the table was in the HTML, I right-clicked the code and selected Copy -> Copy XPath.
# Packages ----
library(rvest)
library(tidyverse)

# Single-genome example ----
# Accession used for the demonstration: Pseudomonas fluorescens SBW25
assembly_accession <- 'GCF_000009225.2'

# Root address shared by all NCBI assembly pages
base_url <- "https://www.ncbi.nlm.nih.gov/assembly"

# Address of the page for this particular assembly
url <- file.path(base_url, assembly_accession)

# Download the page, locate the <dl> nested under the element with
# id "summary", and collect the text of each of its direct children
page <- read_html(url)
summary_list <- html_node(page, xpath = '//*[@id="summary"]/dl')
entries <- html_text(html_children(summary_list))

# Entries at odd positions become the ID column and entries at even
# positions become the value column
# (presumably alternating label/value children of the list -- confirm
# against the page markup)
is_odd <- seq_along(entries) %% 2 == 1
temp <- tibble(
  ID = entries[is_odd],
  value = entries[!is_odd]
)

# Print the resulting two-column table
temp
# Batch example ----
# Scrape the NCBI summary table for every genome listed in genome_output/.
# Each entry returned by list.files() is used both as the assembly accession
# in the URL and as the sub-directory that receives ncbi_summary.txt.
output_dir <- "genome_output"
genomes <- list.files(output_dir)

# Root address shared by all NCBI assembly pages
base_url <- "https://www.ncbi.nlm.nih.gov/assembly"

# seq_along() gives an empty sequence when no genomes are found, so the loop
# body is skipped instead of running for i = 1 and i = 0 as 1:length() would.
for (i in seq_along(genomes)) {
  # Address of the page for this genome
  url <- file.path(base_url, genomes[i])

  # Download the page and collect the text of each child of the <dl> nested
  # under the element with id "summary"
  temp <- read_html(url) %>%
    html_node(xpath = '//*[@id="summary"]/dl') %>%
    html_children() %>%
    html_text()

  # Odd positions become the ID column, even positions the value column
  temp <- tibble(
    ID = temp[c(TRUE, FALSE)],
    value = temp[c(FALSE, TRUE)]
  )

  # Write genome_output/<genome>/ncbi_summary.txt.
  # NOTE(review): the file is space-separated and unquoted (write.table
  # defaults with quote = FALSE), so any field containing spaces cannot be
  # split back into two columns unambiguously -- consider sep = "\t" if the
  # file is meant to be read back in.
  write.table(
    temp,
    file.path(output_dir, genomes[i], "ncbi_summary.txt"),
    row.names = FALSE,
    quote = FALSE
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment