Skip to content

Instantly share code, notes, and snippets.

@ConradStack
Created June 8, 2015 22:22
Show Gist options
  • Save ConradStack/94ad80b640f6467a57f6 to your computer and use it in GitHub Desktop.
Save ConradStack/94ad80b640f6467a57f6 to your computer and use it in GitHub Desktop.
Use R to download the files linked from a simple HTML page
require(RCurl)
require(XML)
require(stringr)
# Configuration -------------------------------------------------------------
# pull.to  : local root directory that receives the downloaded files.
# misc.dir : subdirectory used for files referenced by absolute URLs.
# base.url : page whose links are scanned; relative links resolve against it.
pull.to <- "~/tmp"
misc.dir <- sprintf("%s/lectures", pull.to)
base.url <- "http://www.stat.cmu.edu/~cshalizi/uADA/15/"

# Link extraction -----------------------------------------------------------
# Read the page as a character vector, one element per line of HTML.
lns <- readLines(base.url)

# Only lines from the "schedule" heading onward are scanned for links.
top <- grep("<h2>schedule", lns, ignore.case = TRUE)
if (length(top) == 0) {
  # Without this guard, `top:length(lns)` fails with an obscure
  # "argument of length 0" error.
  stop("No '<h2>schedule' heading found in ", base.url, call. = FALSE)
}
# If the heading pattern matches several lines, start at the first one.
use <- lns[top[1]:length(lns)]

# One character matrix per line; column 2 holds the captured href value.
matched <- str_match_all(use, "<a href=\"(.*?)\"")

# Keep only the lines that contain at least one link. A logical mask is used
# on purpose: the negative-index form `matched[-which(...)]` returns an EMPTY
# list when there is nothing to remove, silently discarding every link.
has.link <- vapply(matched, nrow, integer(1)) > 0
matched <- matched[has.link]
# Download loop -------------------------------------------------------------
# Walk every extracted link. Links ending in .pdf/.csv/.dat/.R are downloaded;
# all other links are reported and skipped.
#   * Relative links are fetched from base.url and stored under pull.to,
#     preserving their relative directory structure.
#   * Absolute (http...) links are fetched as-is and stored flat in misc.dir.
for (xx in matched) {
  # Column 2 of each match matrix is the href value captured by the regex.
  for (rel.url in xx[, 2]) {
    # Require a literal dot before the extension; without it, any link that
    # merely ends in "r" or "dat" (e.g. ".../calendar") would be downloaded.
    if (!grepl("\\.(pdf|csv|dat|R)$", rel.url, ignore.case = TRUE)) {
      # Don't download
      cat(sprintf("Skipping %s\n", rel.url))
      next
    }

    if (grepl("^http", rel.url, ignore.case = TRUE)) {
      # Absolute link: download to the misc subdirectory
      remote.file <- rel.url
      local.dir <- misc.dir
      local.file <- sprintf("%s/%s", misc.dir, basename(rel.url))
    } else {
      # Relative link: mirror the remote layout below pull.to
      remote.file <- sprintf("%s%s", base.url, rel.url)
      local.dir <- sprintf("%s/%s", pull.to, dirname(rel.url))
      local.file <- sprintf("%s/%s", pull.to, rel.url)
    }

    # Make sure the local directory exists
    if (!file.exists(local.dir)) {
      dir.create(local.dir, recursive = TRUE)
    }

    cat(sprintf("Trying to download %s ... ", remote.file))
    # download.file() signals an R error for many failures (404, unreachable
    # host, ...) instead of returning a non-zero status. Catch it so a single
    # broken link does not abort the remaining downloads. mode = "wb" keeps
    # binary files such as PDFs intact on platforms that translate line
    # endings in text mode.
    download.status <- tryCatch(
      download.file(remote.file, local.file, quiet = TRUE, mode = "wb"),
      error = function(e) {
        # Report the reason on stderr, then fall through to the "failed" path.
        message(conditionMessage(e))
        1L
      }
    )

    if (download.status == 0) {
      # Success!
      cat("success\n")
    } else {
      # Failed :(
      cat("failed\n")
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment