Using the Rcrawler package to extract hyperlinks containing certain strings from a website
# The Rcrawler package for website crawling
# Matt Dray
# May 2018

# The need: extract hyperlinks containing certain strings.
# Note: this code hasn't actually been tested.

# install.packages("Rcrawler")
library(Rcrawler)

website <- "https://rostrum.blog/"

Rcrawler::Rcrawler(
  Website = website,
  ExtractXpathPat = c('//*/a[contains(@href, "wiki")]/@href'),  # extract URLs containing this term
  # ExtractXpathPat = c("//*/a/@href"),  # or extract *all* URLs
  PatternsNames = c("link"),  # name for the extracted field
  MaxDepth = 2,               # how many links deep to crawl
  ManyPerPattern = TRUE,      # return every match per page, not just the first
  DIR = "html_wiki",          # directory to save the crawled HTML files to
  Obeyrobots = TRUE,          # respect the site's robots.txt rules
  statslinks = FALSE,         # if TRUE, compute in/out link counts per page
  no_cores = 2                # number of CPU cores to use
)
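
Note that Rcrawler() doesn't return its results directly; it writes them to the global environment as INDEX (a data frame with one row per crawled page) and DATA (a list of the values matched by ExtractXpathPat). A minimal sketch of collecting the matched links afterwards, untested like the crawl above:

# Inspect the objects Rcrawler leaves in the workspace after crawling
str(INDEX)  # per-page crawl metadata (URL, HTTP status, crawl level, etc.)

# Flatten the per-page matches into a single character vector of links;
# DATA holds whatever the XPath pattern extracted from each page
wiki_links <- unique(unlist(DATA))
head(wiki_links)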