Distributed Web Scraping with R
### SETUP =====================================================================

# Install and load packages if necessary
if (!require("WikipediR")) {
  install.packages("WikipediR")
  library(WikipediR)
}
library(magrittr)

# Set seed for sampling
set.seed(1234)

### LINKS =====================================================================

# Recursive function to get multiple levels of links
get_links <- function(start, levels, so_far = character(0)) {

  # End of recursion
  if (levels == 0) {
    return(so_far)
  }

  # Extract links from wikipedia response
  links <- page_links("en", "wikipedia", page = start) %>%
    purrr::pluck("query", "pages", 1, "links") %>%
    purrr::map(purrr::keep, .p = is.character) %>%
    purrr::flatten() %>%
    purrr::flatten_chr()

  # Join links obtained so far with current links
  links <- unique(c(links, so_far))

  # Run `get_links` for each link found
  purrr::map(links, get_links, levels - 1, links)
}

# Get two levels of links starting with "R"
links <- "R (programming language)" %>%
  get_links(2) %>%
  rlang::squash_chr() %>%
  unique() %>%
  purrr::map(~page_info("en", "wikipedia", page = .x)) %>%
  purrr::map(purrr::pluck, "query", "pages", 1, "fullurl") %>%
  rlang::squash_chr()

### SEQUENTIAL ================================================================

# Function to download Wikipedia articles
download_wiki <- function(url, path) {

  # Convert URL into a file name
  file <- url %>%
    utils::URLdecode() %>%
    stringr::str_extract("(?<=/)[^/]+$") %>%
    stringr::str_replace_all("[:punct:]", "") %>%
    stringr::str_to_lower() %>%
    stringr::str_c(normalizePath(path), "/", ., ".html")

  # Save page to disk
  httr::GET(url, httr::write_disk(file, TRUE))

  return(file)
}
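
# For illustration, assuming the "~/Desktop/Wiki" folder already exists, a
# single call saves the page and returns the generated file path (roughly
# "~/Desktop/Wiki/rprogramminglanguage.html" for this example URL):
# download_wiki("https://en.wikipedia.org/wiki/R_(programming_language)", "~/Desktop/Wiki")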

# Download all files sequentially
files <- purrr::map_chr(links, download_wiki, "~/Desktop/Wiki")

# Remove all downloaded files for completeness
purrr::walk(files, file.remove)

### PARALLEL ==================================================================

# Create simplified version of download_wiki()
download_wiki_ <- purrr::partial(
  download_wiki, path = "~/Desktop/Wiki", .first = FALSE)

# Download all files in parallel
files <- parallel::mcmapply(
  download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)

# Remove all downloaded files for completeness
purrr::walk(files, file.remove)
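
# Note: mcmapply() relies on forking, so mc.cores > 1 has no effect on
# Windows. A rough sketch of a socket-cluster alternative (the cluster size
# and the objects exported below are assumptions, adjust to your setup):
# cl <- parallel::makeCluster(4)
# parallel::clusterEvalQ(cl, library(magrittr))
# parallel::clusterExport(cl, c("download_wiki", "download_wiki_"))
# files <- parallel::parSapply(cl, links, download_wiki_)
# parallel::stopCluster(cl)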

### DISTRIBUTED ===============================================================

# Code for the python server on each worker
# #!/usr/bin/env python
#
# from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
# import SocketServer
# from subprocess import call
#
# class S(BaseHTTPRequestHandler):
#     def _set_headers(self):
#         self.send_response(200)
#         self.send_header('Content-type', 'text/html')
#         self.end_headers()
#
#     def do_POST(self):
#         content_length = int(self.headers['Content-Length'])  # <--- Gets the size of data
#         post_data = self.rfile.read(content_length)  # <--- Gets the data itself
#         self._set_headers()  # <--- Sends back a 200 response
#         call(["Rscript", "/home/ctlente/script.R", post_data])
#
# def run(server_class=HTTPServer, handler_class=S, port=80):
#     server_address = ('', port)
#     httpd = server_class(server_address, handler_class)
#     print 'Starting httpd...'
#     httpd.serve_forever()
#
# if __name__ == "__main__":
#     from sys import argv
#
#     if len(argv) == 2:
#         run(port=int(argv[1]))
#     else:
#         run()

# Code for the R script on each worker
# #!/usr/bin/env Rscript
# args <- commandArgs(trailingOnly = TRUE)
# library(magrittr)
#
# links <- stringr::str_split(args[1], " ")[[1]]
#
# download_wiki <- function(url, path) {
#
#   file <- url %>%
#     utils::URLdecode() %>%
#     stringr::str_extract("(?<=/)[^/]+$") %>%
#     stringr::str_replace_all("[:punct:]", "") %>%
#     stringr::str_to_lower() %>%
#     stringr::str_c(normalizePath(path), "/", ., ".html")
#
#   httr::GET(url, httr::write_disk(file, TRUE))
#
#   return(file)
# }
#
# download_wiki_ <- purrr::partial(
#   download_wiki, path = "~/wiki", .first = FALSE)
#
# parallel::mcmapply(
#   download_wiki_, links, SIMPLIFY = TRUE, mc.cores = 4)
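
# To smoke-test a worker without the HTTP server in front of it, the script
# above can also be run directly with a space-separated list of URLs
# (illustrative URLs, adjust the script path to your setup):
# Rscript /home/ctlente/script.R "https://en.wikipedia.org/wiki/R_(programming_language) https://en.wikipedia.org/wiki/RStudio"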

# Split links into groups
num_workers <- 3
links_split <- links %>%
  split(., ceiling(seq_along(.) / (length(.) / num_workers))) %>%
  purrr::map(stringr::str_c, collapse = " ")

# Endpoint data
workers <- "localhost" # INSERT YOUR WORKERS' IPS HERE (one per worker)
endpoints <- stringr::str_c("http://", workers, ":8000")

# Call each worker but don't wait for them
for (i in seq_len(num_workers)) {
  system(paste0("curl -d '", links_split[[i]], "' ", endpoints[i]), wait = FALSE)
}
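
# For reference, with the values above the command built for the first worker
# looks roughly like this (URL list truncated, illustrative):
# curl -d 'https://en.wikipedia.org/wiki/... https://en.wikipedia.org/wiki/...' http://localhost:8000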