-
-
Save tylerritchie/7e9fa32bbc82c242c7f918fc441a57c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape species names from NAEB (Native American Ethnobotany) "use" pages,
# appending each result to an output file; URLs that still fail after retries
# are logged to a second file so they can be fed back in for a re-scrape pass.
library(rvest)  # library() errors on missing packages; require() only warns
library(httr)

# NOTE(review): setwd() in scripts is discouraged — kept for compatibility
# with the original workflow; the file paths below are relative to this dir.
setwd("~/p/species-scrape")

output_file_name <- "output.txt"
error_urls_file_name <- "error_urls.txt"

# The species name is assumed to live in the 8th <a> element on each use
# page — TODO confirm against the page markup if the site layout changes.
species_link_index <- 8

for (i in 6174:45000) {  # adjust the start index to resume a partial run
  url <- paste0("http://naeb.brit.org/uses/", i)
  message("Scraping ", url)
  tryCatch(
    {
      # RETRY() defaults to 3 attempts and adds jitter between retries;
      # timeout(15) raises the per-request timeout to 15 seconds.
      use_page_i <- RETRY("GET", url, timeout(15)) %>% read_html()
      links_i <- use_page_i %>% html_elements("a")
      species_i <- links_i[species_link_index] %>% html_text2()
      write(species_i, file = output_file_name, append = TRUE)
    },
    # On error (i.e. after all retries are exhausted) record the URL so it
    # can be read back in as input for a re-scrape attempt.
    error = function(err) {
      message("Balls, problem with: ", url)
      # conditionMessage() extracts the error text; message(err) would
      # re-signal the condition object instead of printing it.
      message(conditionMessage(err))
      write(url, file = error_urls_file_name, append = TRUE)
    }
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment