library(rvest)
library(httr)
library(dplyr)
library(purrr)
# Get URLs of detail pages from one page of the paginated accommodation list
get_links <- function(num) {
  url <- sprintf("http://www.botswanatourism.co.bw/accommodation?field_facility_region_tid=All&title=&page=%d", num)
  read_html(url) %>%
    html_nodes("td.views-field-title a") %>%
    html_attr("href") %>%
    paste0("http://www.botswanatourism.co.bw", .)
}
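# For example, get_links(0) fetches the first list page (the pager is
# zero-indexed) and returns absolute detail-page URLs; the exact paths
# depend on the site's markup.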
# Get the last page number from the pager's "last" link
get_pagecount <- function() {
  read_html("http://www.botswanatourism.co.bw/accommodation") %>%
    html_node("li.pager-last a") %>%
    html_attr("href") %>%
    sub("^.*=([[:digit:]]+)$", "\\1", .)
}
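# Illustrative example: if the "last" pager link has an href such as
# "/accommodation?field_facility_region_tid=All&title=&page=12", the sub()
# call above keeps only the trailing digits, so get_pagecount() returns "12".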
# Get data from each detail page
get_data <- function(url) {
  # Be polite: pause between requests to avoid hammering the server
  Sys.sleep(10)
  # Scrape the whole body
  body <- read_html(url)
  # Get field nodes, which are pairs of a label and a value
  fields <- html_nodes(body, 'div.field')
  # Get labels
  info_labels <- fields %>%
    html_node('div.field-label') %>%
    html_text() %>%
    sub(":\\W$", "", .)
  # Labels may be NA. The first one is the summary; the rest are considered noise
  info_labels[1] <- "Summary"
  info_labels[is.na(info_labels)] <- "NA"
  # Get items. One label does not always have exactly one item; there can be
  # several, so collapse them into a single string per field
  info_items <- fields %>%
    map_chr(~ html_nodes(., 'div.field-item') %>%
              html_text() %>%
              paste(collapse = " ")) %>%
    gsub("(\\W*</?p>|\\n\\W*)", "", .)
  # Combine them into a named list
  result <- as.list(info_items)
  names(result) <- info_labels
  result$title <- body %>%
    html_nodes('#page-title') %>%
    html_text()
  # Return as a data_frame
  as_data_frame(result)
}
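# The returned data_frame has one row and one column per labelled field
# (names such as "Summary" come straight from the page, so the set of
# columns can vary between listings), plus a title column.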
#==== Main ====

# Get URLs of individual pages. The list pages are zero-indexed, so page
# numbers run from 0 to pagecount; shift the list index by one, since R
# lists are one-indexed
links <- list()
pagecount <- as.numeric(get_pagecount())
for (i in length(links):pagecount) {
  cat("Scraping list page", i, "...\n")
  links[[i + 1]] <- get_links(i)
}
# Flatten the list of character vectors into one vector
links <- unlist(links)
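# A quick smoke test before the long-running loop may be worthwhile
# (a sketch, assuming at least one link was found):
#   str(get_data(links[1]))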
# Scrape page by page. Starting the index at length(result) + 1 means that,
# after an error, re-running just this loop (without resetting result)
# resumes from the first missing page
result <- list()
for (i in (length(result) + 1):length(links)) {
  cat("Scraping detail page", i, "...\n")
  result[[i]] <- get_data(links[i])
}
# Export data as CSV. The column indices in the minimal export depend on the
# order in which fields appear across pages, so inspect
# names(bind_rows(result)) before relying on them
write.csv(bind_rows(result), file = "botswanatourism.csv", row.names = FALSE)
write.csv(bind_rows(result)[, c(15, 1:10, 18)], file = "botswanatourism_minimal.csv", row.names = FALSE)
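# A more defensive variant of the detail-page loop (a sketch built on the
# same get_data() above, not part of the original run): tryCatch keeps one
# broken page from aborting the whole scrape.
#
# for (i in (length(result) + 1):length(links)) {
#   cat("Scraping detail page", i, "...\n")
#   result[[i]] <- tryCatch(
#     get_data(links[i]),
#     error = function(e) {
#       warning("failed on ", links[i], ": ", conditionMessage(e))
#       data_frame()  # empty placeholder keeps indices aligned; bind_rows drops it
#     }
#   )
# }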