ConradStack · June 8, 2015 22:22
diff --git a/scrape.r b/scrape.r
 require(RCurl)
 require(XML)
 require(stringr)

 # Mirror files linked from the page at base.url into pull.to.
 #
 # Reads the page, keeps the lines from the first "<h2>schedule" heading
 # onward, extracts every <a href="..."> target, and downloads the ones whose
 # name ends in .pdf, .csv, .dat or .R (case-insensitive). Relative links keep
 # their sub-path below pull.to; absolute (http...) links are saved by
 # basename under misc.dir. Requires stringr (str_match_all) to be attached.
 pull.to <- "~/tmp"
 misc.dir <- sprintf("%s/lectures", pull.to)
 base.url <- "http://www.stat.cmu.edu/~cshalizi/uADA/15/"
 lns <- readLines(base.url)
 top <- grep("<h2>schedule", lns, ignore.case = TRUE)
 if (length(top) == 0) {
 	# Without this guard top:length(lns) fails with "argument of length 0".
 	stop("No '<h2>schedule' heading found in ", base.url, call. = FALSE)
 }
 # Only the first heading match is used as the starting line.
 use <- lns[top[1]:length(lns)]

 # One match matrix per line; column 2 holds the captured href value.
 matched <- str_match_all(use, "<a href=\"(.*?)\"")
 to.rm <- which(vapply(matched, nrow, integer(1)) == 0)
 # Guard the negative index: matched[-integer(0)] selects nothing, so an
 # empty to.rm would otherwise throw away every match.
 if (length(to.rm) > 0) {
 	matched <- matched[-to.rm]
 }

 for (xx in matched) {
 	for (rel.url in xx[, 2]) {
 		# The escaped dot restricts this to real file extensions; without it
 		# any link ending in the letter "r" would be downloaded.
 		if (!grepl("\\.(pdf|csv|dat|R)$", rel.url, ignore.case = TRUE)) {
 			# Don't download
 			cat(sprintf("Skipping %s\n", rel.url))
 			next
 		}

 		if (grepl("^http", rel.url, ignore.case = TRUE)) {
 			# Absolute link: fetch as-is and store flat in the misc subdirectory
 			remote.file <- rel.url
 			local.file <- sprintf("%s/%s", misc.dir, basename(rel.url))
 			if (!file.exists(misc.dir)) {
 				dir.create(misc.dir, recursive = TRUE)
 			}
 		} else {
 			# Relative link: resolve against base.url and mirror its sub-path
 			remote.file <- sprintf("%s%s", base.url, rel.url)
 			local.file <- sprintf("%s/%s", pull.to, rel.url)
 			# make sure local directory exists
 			local.dir <- sprintf("%s/%s", pull.to, dirname(rel.url))
 			if (!file.exists(local.dir)) {
 				dir.create(local.dir, recursive = TRUE)
 			}
 		}

 		cat(sprintf("Trying to download %s ... ", remote.file))
 		# download.file usually signals an error rather than returning a
 		# non-zero status; convert that into a failure code so one bad link
 		# does not abort the remaining downloads. mode = "wb" keeps binary
 		# files (e.g. PDFs) byte-exact on every platform.
 		download.status <- tryCatch(
 			download.file(remote.file, local.file, quiet = TRUE, mode = "wb"),
 			error = function(e) {
 				message(conditionMessage(e))
 				1L
 			}
 		)
 		if (download.status == 0) {
 			# Success!
 			cat("success\n")
 		} else {
 			# Failed :(
 			cat("failed\n")
 		}
 	}
 }
	require(RCurl)
	require(XML)
	require(stringr)

	# Mirror files linked from the page at base.url into pull.to.
	#
	# Reads the page, keeps the lines from the first "<h2>schedule" heading
	# onward, extracts every <a href="..."> target, and downloads the ones whose
	# name ends in .pdf, .csv, .dat or .R (case-insensitive). Relative links keep
	# their sub-path below pull.to; absolute (http...) links are saved by
	# basename under misc.dir. Requires stringr (str_match_all) to be attached.
	pull.to <- "~/tmp"
	misc.dir <- sprintf("%s/lectures", pull.to)
	base.url <- "http://www.stat.cmu.edu/~cshalizi/uADA/15/"
	lns <- readLines(base.url)
	top <- grep("<h2>schedule", lns, ignore.case = TRUE)
	if (length(top) == 0) {
		# Without this guard top:length(lns) fails with "argument of length 0".
		stop("No '<h2>schedule' heading found in ", base.url, call. = FALSE)
	}
	# Only the first heading match is used as the starting line.
	use <- lns[top[1]:length(lns)]

	# One match matrix per line; column 2 holds the captured href value.
	matched <- str_match_all(use, "<a href=\"(.*?)\"")
	to.rm <- which(vapply(matched, nrow, integer(1)) == 0)
	# Guard the negative index: matched[-integer(0)] selects nothing, so an
	# empty to.rm would otherwise throw away every match.
	if (length(to.rm) > 0) {
		matched <- matched[-to.rm]
	}

	for (xx in matched) {
		for (rel.url in xx[, 2]) {
			# Alternation uses a plain "|": "\|" is not a valid escape in an R
			# string literal and stops the script from parsing. The escaped dot
			# restricts the match to real file extensions.
			if (!grepl("\\.(pdf|csv|dat|R)$", rel.url, ignore.case = TRUE)) {
				# Don't download
				cat(sprintf("Skipping %s\n", rel.url))
				next
			}

			if (grepl("^http", rel.url, ignore.case = TRUE)) {
				# Absolute link: fetch as-is and store flat in the misc subdirectory
				remote.file <- rel.url
				local.file <- sprintf("%s/%s", misc.dir, basename(rel.url))
				if (!file.exists(misc.dir)) {
					dir.create(misc.dir, recursive = TRUE)
				}
			} else {
				# Relative link: resolve against base.url and mirror its sub-path
				remote.file <- sprintf("%s%s", base.url, rel.url)
				local.file <- sprintf("%s/%s", pull.to, rel.url)
				# make sure local directory exists
				local.dir <- sprintf("%s/%s", pull.to, dirname(rel.url))
				if (!file.exists(local.dir)) {
					dir.create(local.dir, recursive = TRUE)
				}
			}

			cat(sprintf("Trying to download %s ... ", remote.file))
			# download.file usually signals an error rather than returning a
			# non-zero status; convert that into a failure code so one bad link
			# does not abort the remaining downloads. mode = "wb" keeps binary
			# files (e.g. PDFs) byte-exact on every platform.
			download.status <- tryCatch(
				download.file(remote.file, local.file, quiet = TRUE, mode = "wb"),
				error = function(e) {
					message(conditionMessage(e))
					1L
				}
			)
			if (download.status == 0) {
				# Success!
				cat("success\n")
			} else {
				# Failed :(
				cat("failed\n")
			}
		}
	}