library(rvest)
library(httr)
library(dplyr)
library(purrr)
# Get the URLs of the detail pages listed on one listing page (pages are 0-indexed)
get_links <- function(num) {
  url <- sprintf("http://www.botswanatourism.co.bw/accommodation?field_facility_region_tid=All&title=&page=%d", num)
  read_html(url) %>%
    html_nodes("td.views-field-title a") %>%
    html_attr("href") %>%
    paste0("http://www.botswanatourism.co.bw", .)
}
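# Usage sketch (network access assumed; listing pages are 0-indexed, so
# get_links(0) fetches the first page):
#   first_page <- get_links(0)
#   head(first_page)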
# Get the last page number from the pager's "last page" link
get_pagecount <- function() {
  read_html("http://www.botswanatourism.co.bw/accommodation") %>%
    html_node("li.pager-last a") %>%
    html_attr("href") %>%
    sub("^.*=([[:digit:]]+)$", "\\1", .)
}
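# Quick no-network sanity check of the pager regex above, run on a made-up href
# of the shape a Drupal pager produces (the path and number are illustrative):
stopifnot(sub("^.*=([[:digit:]]+)$", "\\1", "/accommodation?title=&page=12") == "12")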
# Get the data from one detail page
get_data <- function(url) {
  # Wait between requests so we don't hammer the server
  Sys.sleep(10)
  # Parse the whole page
  body <- read_html(url)
  # Get the field nodes, each of which pairs a label with its value(s)
  fields <- html_nodes(body, 'div.field')
  # Get the labels, stripping the trailing colon
  info_labels <- fields %>%
    html_node('div.field-label') %>%
    html_text() %>%
    sub(":\\W$", "", .)
  # Labels can be missing (NA): the first field is the summary, the rest are
  # treated as noise and lumped under the literal name "NA"
  info_labels[1] <- "Summary"
  info_labels[is.na(info_labels)] <- "NA"
  # Get the items. A label does not always have exactly one item, so collapse
  # multiple items into a single string per field
  info_items <- fields %>%
    map_chr(~ html_nodes(., 'div.field-item') %>%
              html_text() %>%
              paste(collapse = " ")) %>%
    gsub("(\\W*</?p>|\\n\\W*)", "", .)
  # Combine them into a named list
  result <- as.list(info_items)
  names(result) <- info_labels
  result$title <- body %>%
    html_nodes('#page-title') %>%
    html_text()
  # Return as a data_frame
  as_data_frame(result)
}
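# Spot-check sketch (network access assumed): once `links` has been built below,
# it can help to scrape a single page and inspect its field labels before
# running the full loop:
#   one <- get_data(links[1])
#   names(one)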
#==== Main ====
# Get the URLs of individual detail pages from every listing page.
# Listing pages are 0-indexed while list positions are 1-indexed, hence i + 1;
# starting at length(links) lets the loop resume where it left off on a rerun.
links <- list()
pagecount <- as.numeric(get_pagecount())
for (i in length(links):pagecount) {
  cat("Scraping list page", i, "...\n")
  links[[i + 1]] <- get_links(i)
}
# Flatten the list of character vectors into one vector
links <- unlist(links)
# Scrape the detail pages one by one; starting at length(result) + 1 lets the
# loop resume where it left off on a rerun
result <- list()
for (i in (length(result) + 1):length(links)) {
  cat("Scraping detail page", i, "of", length(links), "...\n")
  result[[i]] <- get_data(links[i])
}
# Export the data as CSV: the full table, plus a reduced subset of columns.
# The numeric indices depend on the order in which fields were scraped.
write.csv(bind_rows(result), file = "botswanatourism.csv", row.names = FALSE)
write.csv(bind_rows(result)[, c(15, 1:10, 18)], file = "botswanatourism_minimal.csv", row.names = FALSE)
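# The index-based selection above silently picks the wrong columns if the site's
# field order changes. A sketch of selecting by name instead ("title" and
# "Summary" come from get_data(); extend the vector with the labels you need):
#   all_data <- bind_rows(result)
#   wanted <- intersect(c("title", "Summary"), names(all_data))
#   write.csv(all_data[, wanted], file = "botswanatourism_by_name.csv", row.names = FALSE)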