Forked from Oreotrephes/naeb
library(rvest)  # html parsing; also re-exports the magrittr pipe %>%
library(httr)   # RETRY() and timeout()

setwd('~/p/species-scrape')
output_file_name <- 'output.txt'
error_urls_file_name <- 'error_urls.txt'

for (i in 6174:45000) {  # start at 1 (or wherever) for a full run
  url <- paste0("http://naeb.brit.org/uses/", i)
  message("Scraping ", url)
  tryCatch({
    # RETRY() defaults to 3 attempts and adds some jitter between them; it takes
    # the same config arguments GET() does, so the timeout is upped to 15 seconds.
    # Pipe the url into RETRY(), then feed the response to read_html()
    # (the internet says this works).
    use_page_i <- url %>% RETRY("GET", ., timeout(15)) %>% read_html()
    links_i <- use_page_i %>% html_elements("a")
    species_i <- links_i[8] %>% html_text2()  # the 8th <a> on the page holds the species name
    write(species_i, file = output_file_name, append = TRUE)
  },
  # On an error (which should only happen after the 3 retries are exhausted),
  # write the url to the error file; that file can then be read back in as
  # input for a re-scrape attempt.
  error = function(err) {
    message("Balls, problem with: ", url)
    message(err)
    write(url, file = error_urls_file_name, append = TRUE)
  })
}
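
# The error handler above notes that error_urls.txt can be read back in for a
# re-scrape pass. A minimal sketch of that pass, assuming the one-url-per-line
# layout written above; the scrape_use() helper is hypothetical, not part of
# the original gist:

scrape_use <- function(url) {
  # Same fetch-and-extract steps as the main loop.
  use_page <- url %>% RETRY("GET", ., timeout(15)) %>% read_html()
  links <- use_page %>% html_elements("a")
  links[8] %>% html_text2()
}

if (file.exists(error_urls_file_name)) {
  retry_urls <- readLines(error_urls_file_name)
  for (url in retry_urls) {
    message("Re-scraping ", url)
    tryCatch(
      write(scrape_use(url), file = output_file_name, append = TRUE),
      error = function(err) message("Still failing: ", url)
    )
  }
}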