padpadpadpad · January 15, 2021 17:49
diff --git a/scrape_ncbi_assembly_summary.R b/scrape_ncbi_assembly_summary.R
 # script to grab assembly information from the NCBI website
 # I found the XPath of the table I wanted by opening an NCBI genome page in Google Chrome
 # I then clicked View -> Developer -> Inspect elements
 # After finding where in the html the table was, I right-clicked the code and selected Copy -> Copy XPath

 # packages ####
 # rvest provides the html helpers used below, tidyverse provides tibble()
 library(rvest)
 library(tidyverse)

 # single-genome example: Pseudomonas fluorescens SBW25
 assembly_accession <- 'GCF_000009225.2'
 # root address of the NCBI assembly pages
 base_url <- "https://www.ncbi.nlm.nih.gov/assembly"
 # page address for this assembly: <base_url>/<accession>
 url <- paste(base_url, assembly_accession, sep = "/")
 # download and parse the page
 temp <- read_html(url)
 # pick out the <dl> element sitting directly under the element with id "summary"
 temp <- html_node(temp, xpath = '//*[@id="summary"]/dl')
 # text of every direct child of that list, in document order
 temp <- html_text(html_children(temp))
 # odd positions go to the ID column, even positions to the value column
 temp <- tibble(
   ID = temp[c(TRUE, FALSE)],
   value = temp[c(FALSE, TRUE)]
 )
 # show the resulting table
 temp

 # example of how to run it in a for loop

 # list accession numbers to genomes
 # every entry of genome_output/ is used both as the accession in the URL and
 # as the folder the summary is written into
 genomes <- list.files('genome_output/')

 # run a loop to scrape the html of the ncbi website for each genome summary
 # table and save it as ncbi_summary.txt inside that genome's folder

 # set base URL
 base_url <- "https://www.ncbi.nlm.nih.gov/assembly"

 # seq_along() gives zero iterations when genomes is empty; 1:length(genomes)
 # would instead run over 1 and 0 and request the page for NA
 for (i in seq_along(genomes)) {

   # specify genome path
   url <- file.path(base_url, genomes[i])

   # read in html using rvest: keep the text of each child of the summary list
   temp <- read_html(url) %>%
     html_node(xpath = '//*[@id="summary"]/dl') %>%
     html_children() %>%
     html_text()

   # odd positions become ID, even positions become value
   # NOTE(review): assumes the children strictly alternate label / value -
   # confirm against the page markup
   temp <- tibble(
     ID = temp[c(TRUE, FALSE)],
     value = temp[c(FALSE, TRUE)]
   )

   # write the table to genome_output/<accession>/ncbi_summary.txt
   write.table(
     temp,
     file.path('genome_output', genomes[i], 'ncbi_summary.txt'),
     row.names = FALSE,
     quote = FALSE
   )
 }
	# script to grab assembly information from the NCBI website
	# I found the XPath of the table I wanted by opening an NCBI genome page in Google Chrome
	# I then clicked View -> Developer -> Inspect elements
	# After finding where in the html the table was, I right-clicked the code and selected Copy -> Copy XPath

	# packages ####
	# rvest provides the html helpers used below, tidyverse provides tibble()
	library(rvest)
	library(tidyverse)

	# single-genome example: Pseudomonas fluorescens SBW25
	assembly_accession <- 'GCF_000009225.2'
	# root address of the NCBI assembly pages
	base_url <- "https://www.ncbi.nlm.nih.gov/assembly"
	# page address for this assembly: <base_url>/<accession>
	url <- paste(base_url, assembly_accession, sep = "/")
	# download and parse the page
	temp <- read_html(url)
	# pick out the <dl> element sitting directly under the element with id "summary"
	temp <- html_node(temp, xpath = '//*[@id="summary"]/dl')
	# text of every direct child of that list, in document order
	temp <- html_text(html_children(temp))
	# odd positions go to the ID column, even positions to the value column
	temp <- tibble(
	  ID = temp[c(TRUE, FALSE)],
	  value = temp[c(FALSE, TRUE)]
	)
	# show the resulting table
	temp

	# example of how to run it in a for loop

	# list accession numbers to genomes
	# every entry of genome_output/ is used both as the accession in the URL and
	# as the folder the summary is written into
	genomes <- list.files('genome_output/')

	# run a loop to scrape the html of the ncbi website for each genome summary
	# table and save it as ncbi_summary.txt inside that genome's folder

	# set base URL
	base_url <- "https://www.ncbi.nlm.nih.gov/assembly"

	# seq_along() gives zero iterations when genomes is empty; 1:length(genomes)
	# would instead run over 1 and 0 and request the page for NA
	for (i in seq_along(genomes)) {

	  # specify genome path
	  url <- file.path(base_url, genomes[i])

	  # read in html using rvest: keep the text of each child of the summary list
	  temp <- read_html(url) %>%
	    html_node(xpath = '//*[@id="summary"]/dl') %>%
	    html_children() %>%
	    html_text()

	  # odd positions become ID, even positions become value
	  # NOTE(review): assumes the children strictly alternate label / value -
	  # confirm against the page markup
	  temp <- tibble(
	    ID = temp[c(TRUE, FALSE)],
	    value = temp[c(FALSE, TRUE)]
	  )

	  # write the table to genome_output/<accession>/ncbi_summary.txt
	  write.table(
	    temp,
	    file.path('genome_output', genomes[i], 'ncbi_summary.txt'),
	    row.names = FALSE,
	    quote = FALSE
	  )
	}