library(rvest)
library(httr)
library(dplyr)
library(purrr)
# Get URLs of detail pages from one page of the paginated accommodation list
get_links <- function(num) {
  url <- sprintf("http://www.botswanatourism.co.bw/accommodation?field_facility_region_tid=All&title=&page=%d", num)
  read_html(url) %>%
    html_nodes("td.views-field-title a") %>%
    html_attr("href") %>%
    paste0("http://www.botswanatourism.co.bw", .)
}
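# For example, get_links(0) fetches the first list page (the pager is
# zero-indexed) and returns absolute detail-page URLs; the exact paths
# depend on the site's markup.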
# Get the last page number from the pager's "last" link
get_pagecount <- function() {
  read_html("http://www.botswanatourism.co.bw/accommodation") %>%
    html_node("li.pager-last a") %>%
    html_attr("href") %>%
    sub("^.*=([[:digit:]]+)$", "\\1", .)
}
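# Illustrative example: if the "last" pager link has an href such as
# "/accommodation?field_facility_region_tid=All&title=&page=12", the sub()
# call above keeps only the trailing digits, so get_pagecount() returns "12".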
# Get data from each detail page
get_data <- function(url) {
  # Be polite: pause between requests to avoid hammering the server
  Sys.sleep(10)
  # Scrape the whole body
  body <- read_html(url)
  # Get field nodes, which are pairs of a label and a value
  fields <- html_nodes(body, 'div.field')
  # Get labels
  info_labels <- fields %>%
    html_node('div.field-label') %>%
    html_text() %>%
    sub(":\\W$", "", .)
  # Labels may be NA. The first one is the summary; the rest are considered noise
  info_labels[1] <- "Summary"
  info_labels[is.na(info_labels)] <- "NA"
  # Get items. One label does not always have exactly one item; there can be
  # several, so collapse them into a single string per field
  info_items <- fields %>%
    map_chr(~ html_nodes(., 'div.field-item') %>%
              html_text() %>%
              paste(collapse = " ")) %>%
    gsub("(\\W*</?p>|\\n\\W*)", "", .)
  # Combine them into a named list
  result <- as.list(info_items)
  names(result) <- info_labels
  result$title <- body %>%
    html_nodes('#page-title') %>%
    html_text()
  # Return as a data_frame
  as_data_frame(result)
}
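# The returned data_frame has one row and one column per labelled field
# (names such as "Summary" come straight from the page, so the set of
# columns can vary between listings), plus a title column.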
#==== Main ====

# Get URLs of individual pages. The list pages are zero-indexed, so page
# numbers run from 0 to pagecount; shift the list index by one, since R
# lists are one-indexed
links <- list()
pagecount <- as.numeric(get_pagecount())
for (i in length(links):pagecount) {
  cat("Scraping list page", i, "...\n")
  links[[i + 1]] <- get_links(i)
}
# Flatten the list of character vectors into one vector
links <- unlist(links)
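# A quick smoke test before the long-running loop may be worthwhile
# (a sketch, assuming at least one link was found):
#   str(get_data(links[1]))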
# Scrape page by page. Starting the index at length(result) + 1 means that,
# after an error, re-running just this loop (without resetting result)
# resumes from the first missing page
result <- list()
for (i in (length(result) + 1):length(links)) {
  cat("Scraping detail page", i, "...\n")
  result[[i]] <- get_data(links[i])
}
# Export data as CSV. The column indices in the minimal export depend on the
# order in which fields appear across pages, so inspect
# names(bind_rows(result)) before relying on them
write.csv(bind_rows(result), file = "botswanatourism.csv", row.names = FALSE)
write.csv(bind_rows(result)[, c(15, 1:10, 18)], file = "botswanatourism_minimal.csv", row.names = FALSE)
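# A more defensive variant of the detail-page loop (a sketch built on the
# same get_data() above, not part of the original run): tryCatch keeps one
# broken page from aborting the whole scrape.
#
# for (i in (length(result) + 1):length(links)) {
#   cat("Scraping detail page", i, "...\n")
#   result[[i]] <- tryCatch(
#     get_data(links[i]),
#     error = function(e) {
#       warning("failed on ", links[i], ": ", conditionMessage(e))
#       data_frame()  # empty placeholder keeps indices aligned; bind_rows drops it
#     }
#   )
# }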