# best practices for web scraping in R // ldply
# best practices for web scraping in R #
# the scrape() function below should be used with ldply
# eg:
ldply(urls, scrape)

# wrap the call in try() to ignore broken links / unresponsive pages;
# returning NULL on error means ldply simply drops those rows
# eg:
ldply(urls, function(url){
  out = try(scrape(url))
  if(inherits(out, 'try-error')) return(NULL)
  return(out)
})

# insert a random sleep interval to prevent getting booted
# eg:
ldply(urls, function(url){
  out = try(scrape(url))
  if(inherits(out, 'try-error')) return(NULL)
  Sys.sleep(sample(seq(1, 3, by=0.001), 1))
  return(out)
})
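
# a hedged sketch, not part of the original gist: the try/sleep pattern above
# can also be wrapped once in a helper -- 'scrape_safely' is a hypothetical
# name -- so every run stays as simple as ldply(urls, scrape_safely)
scrape_safely <- function(url){
  out = try(scrape(url))
  Sys.sleep(sample(seq(1, 3, by=0.001), 1))    # polite pause between requests
  if(inherits(out, 'try-error')) return(NULL)  # NULL rows are dropped by ldply
  return(out)
}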

scrape <- function(url)
{
  # load required packages, installing them first if they're missing
  if(!require('XML')){
    install.packages('XML')
    library('XML')
  }
  if(!require('RCurl')){
    install.packages('RCurl')
    library('RCurl')
  }
  if(!require('plyr')){
    install.packages('plyr')
    library('plyr')
  }
  if(!require('stringr')){
    install.packages('stringr')
    library('stringr')
  }

  # one-row data.frame that will hold the scraped fields for this url
  df = data.frame(url=url, stringsAsFactors=F)

  # download the page; fall back on "readLines" if "getURL" fails
  html = try(getURL(df$url))
  if(inherits(html, 'try-error')){
    html = readLines(df$url, warn=F)
  }
  tree = htmlTreeParse(html, useInternalNodes=T)

  #@@@@@@@@@@@@@@@@@@@@#
  #                    #
  #  ENTER XPATH HERE: #
  #                    #
  #$$$$$$$$$$$$$$$$$$$$#
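
  # a hedged illustration only, not part of the original gist: with the parsed
  # tree in hand, fields are typically pulled out with xpathSApply / xmlValue
  # and attached to df -- the '//title' path and the 'title'/'links' columns
  # below are hypothetical stand-ins for whatever the page actually requires
  # eg:
  # df$title = xpathSApply(tree, '//title', xmlValue)[1]
  # df$links = paste(xpathSApply(tree, '//a/@href'), collapse='; ')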

  return(data.frame(df, stringsAsFactors=F))
}
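
# a hedged usage sketch (also not in the original gist): 'urls' is a
# hypothetical character vector of pages; the result is one data.frame
# with a row per successfully scraped url
# eg:
# urls = c('http://www.example.com/page-1', 'http://www.example.com/page-2')
# results = ldply(urls, scrape_safely, .progress='text')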