Skip to content

Instantly share code, notes, and snippets.

@andybega
Last active October 2, 2019 19:58
Show Gist options
  • Save andybega/0f16b6a0de0185c157e2a280b5e05074 to your computer and use it in GitHub Desktop.
Save andybega/0f16b6a0de0185c157e2a280b5e05074 to your computer and use it in GitHub Desktop.
Scrape the Wikipedia terror incidents pages/lists
#
# copy and run this code in R to setup and source `wikipedia-terrorism.R` from this gist
#
packs <- c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble", "devtools")
need <- packs[!packs %in% rownames(installed.packages())]
install.packages(need)
devtools::source_gist("https://gist.github.com/andybega/0f16b6a0de0185c157e2a280b5e05074",
filename = "wikipedia-terrorism.R")
library("dplyr")
wikipedia_terrorism_scrape(from = "2019-08")
#
# Scrape the Wikipedia terror incidents pages/lists
# 2019-10-02
#
# this relies on a hard coded URL for the terrorism page list template page,
# which includes URLs for all relevant pages with data tables
# https://en.wikipedia.org/wiki/Template:Lists_of_Terrorist_Incidents
#
# dependencies that should be installed for this to work:
# c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble")
#
# only the pipe operator needs to be actually attached, all other functions are
# called with package::func(), so it should not be neccessary to actually load
# the packages
#
# try:
#
# packs <- c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble")
# need <- packs[!packs %in% rownames(installed.packages())]
# install.packages(need)
# library("dplyr")
#
# usage:
#
# source this script, then:
#
# wikipedia_terrorism_scrape(from = "2019-08")
#
#' @param from year or "YYYY-MM" string
#' @param to year or "YYYY-MM" string, if NULL will default to latest
#'
#' @export
wikipedia_terrorism_scrape <- function(from = 2019, to = NULL) {
# parse from and to dates
if (nchar(from)==4) from <- as.Date(paste0(from, "-01-01"))
if (nchar(from)==7) from <- as.Date(paste0(from, "-01"))
if (!inherits(from, "Date")) {
stop("Could not parse 'from', form should be 'YYYY' or 'YYYY-MM'")
}
if (is.null(to)) to <- 2100
if (nchar(to)==4) to <- as.Date(paste0(to, "-01-01"))
if (nchar(to)==7) to <- as.Date(paste0(to, "-01"))
if (!inherits(to, "Date")) {
stop("Could not parse 'to', form should be 'YYYY' or 'YYYY-MM'")
}
urls <- wikipedia_terrorism_pages()
# extract normalized date from URL
url_date <- wikipedia_terrorism_parse_url_date(urls)[["start"]]
want <- url_date >= from & url_date <= to
urls <- urls[want]
# the actual meat and potatoes...loop through and get the pages/tables/data
datatbl <- list()
for (i in seq_along(urls)) {
tbl_i <- wikipedia_terrorism_scrape_table(urls[i])
datatbl[[i]] <- tbl_i
}
datatbl <- dplyr::bind_rows(datatbl)
datatbl
}
#' @export
wikipedia_terrorism_pages <- function() {
url <- "https://en.wikipedia.org/wiki/Template:Lists_of_Terrorist_Incidents"
page <- xml2::read_html(url)
page_urls <- rvest::html_nodes(page, "a") %>%
rvest::html_attr("href") %>%
purrr::keep(stringr::str_detect(., "terrorist_incidents_in")) %>%
paste0("https://en.wikipedia.org/", .)
# from 2015 on, there are month pages but also summary year pages
# exclude the year pages to avoid duplication
exclude <- page_urls %>%
stringr::str_extract(., "in_[0-9]{4}$") %>%
stringr::str_extract(., "[0-9]+") %>%
as.integer() %>%
sapply(., function(x) isTRUE(x >= 2015))
page_urls <- page_urls[!exclude]
page_urls
}
#' @export
wikipedia_terrorism_scrape_table <- function(url) {
page <- httr::GET(url)
# a page can include multiple tables, some of which may be irrelevant
all_tables <- XML::readHTMLTable(doc = httr::content(page, "text"),
stringsAsFactors = FALSE,
header = TRUE)
# filter out irrelevant tables
all_tables <- id_relevant_tables(all_tables)
table_ids <- names(all_tables)
# iterate through data tables and clean
datatbl <- list()
for (i in seq_along(all_tables)) {
tbl_i <- tibble::as_tibble(all_tables[[i]])
table_id_i <- table_ids[i]
date_stem <- wikipedia_terrorism_parse_table_date(table_id_i, url)
colnames(tbl_i) <- colnames(tbl_i) %>%
tolower() %>%
stringr::str_replace_all(" ", "_") %>%
stringr::str_replace_all("\n", "")
# replace em-dash with regular dash for dates like 1-3 January
tbl_i$date <- stringr::str_replace(tbl_i$date, "\u2013", "-")
# some tables have dates like "1", "1-3"; others include the month,
# like "January 1"
# if no month, use the date stem from ULR to id
month <- ifelse(stringr::str_detect(tbl_i$date, "^[0-9\\-]+$"),
date_stem$month,
tbl_i$date %>%
stringr::str_extract("^[A-Za-z]+") %>%
match(month.name)
)
# take out month in date if present, leaving only day
# keep orig for debugging dump if parsing failures
date_orig <- tbl_i$date
tbl_i$date <- tbl_i$date %>%
stringr::str_extract("[0-9\\-]+$")
# for multi-day dates, keep first day and record end date in end_date var
tbl_i$end_date <- tbl_i$date %>%
stringr::str_extract(., "-[0-9]{1,2}$") %>%
stringr::str_remove("-") %>%
ifelse(!is.na(.), paste0(month, .), .) %>%
as.Date(format = "%Y-%m-%d")
tbl_i$date <- tbl_i$date %>%
stringr::str_extract(., "^[0-9]+") %>%
paste(date_stem$year, month, ., sep = "-") %>%
as.Date(format = "%Y-%m-%d")
# parse minimum fatality count
tbl_i$dead_min <- tbl_i$dead %>%
stringr::str_extract(., "^[0-9]+") %>%
as.numeric()
# warn if there were parsing failures
if (any(is.na(tbl_i$date))) {
idx <- is.na(tbl_i$date)
payload <- tibble::as_tibble(cbind(orig = date_orig[idx],
tbl_i[idx, "date"],
stringsAsFactors = FALSE))
payload <- paste(capture.output(print(payload)), collapse = "\n")
warning("Some dates were not parsed\n",
"Table: ", table_id_i, "\n",
payload)
}
if (any(is.na(tbl_i$dead_min))) {
payload <- tbl_i[is.na(tbl_i$dead_min), c("dead", "dead_min")]
payload <- paste(capture.output(print(payload)), collapse = "\n")
warning("Some fatality figures were not parsed\n",
"Table: ", table_id_i, "\n",
payload)
}
# finished with this chunk
datatbl[[i]] <- tbl_i
}
datatbl <- dplyr::bind_rows(datatbl)
datatbl
}
wikipedia_terrorism_parse_table_date <- function(table_id, url) {
stopifnot(length(table_id)==1 | is.null(table_id),
length(url)==1)
if (length(table_id)==1) {
if (table_id=="NULL") table_id <- NULL
}
url_dates <- wikipedia_terrorism_parse_url_date(url)
if (is.null(table_id)) {
stem <- list(year = as.integer(substr(url_dates$start, 1, 4)),
month = NA)
} else {
# table_id like terrorIncidents2011Feb
table_id_end <- stringr::str_extract(table_id, "[0-9]{4}[A-Za-z]{3}")
year <- as.integer(substr(table_id_end, 1, 4))
month <- match(substr(table_id_end, 5, 7), month.abb)
# sometimes the table can have an incorrect ID on wikipedia, i.e.
# page ULR for January 2019 events but the table ID ends in something
# other
# check for this and warn
if (url_dates$start==url_dates$end) {
url_month <- as.integer(substr(url_dates$start, 6, 7))
if (url_month!=month) {
warning("Wikipedia table ID date does not match implied URL date, using URL date:\n",
"Table ID: ", table_id, "\n",
"URL: ", url)
month <- url_month
}
}
stem <- list(year = year, month = month)
}
stem
}
wikipedia_terrorism_parse_url_date <- function(x) {
# there are three kinds of URLs
# year only: https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_2006
# year and month range: https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_January%E2%80%93June_2011
# year and month: List_of_terrorist_incidents_in_September_2019
xbase <- basename(x)
dates <- lapply(xbase, function(x) {
year <- x %>%
stringr::str_extract("[0-9]{4}$") %>%
as.integer()
if (stringr::str_detect(x, "incidents_in_[0-9]{4}$")) {
start_date <- as.Date(paste0(year, "-01-01"))
end_date <- as.Date(paste0(year, "-12-01"))
}
if (stringr::str_detect(x, "%E2%80%93")) {
month1 <- x %>%
stringr::str_extract("[A-Za-z]+%E2%80%93") %>%
stringr::str_remove("%E2%80%93")
month2 <- x %>%
stringr::str_extract("%E2%80%93[A-Za-z]+") %>%
stringr::str_remove("%E2%80%93")
start_date <- as.Date(paste0(year, "-", month1, "-01"), format = "%Y-%B-%d")
end_date <- as.Date(paste0(year, "-", month2, "-01"), format = "%Y-%B-%d")
}
if (stringr::str_detect(x, "_in_[A-Za-z]+_[0-9]+$")) {
month <- x %>%
stringr::str_extract(., "_[A-Za-z]+_[0-9]{4}$") %>%
stringr::str_extract("[A-Za-z]+")
start_date <- end_date <- as.Date(paste0(year, "-", month, "-01"),
format = "%Y-%B-%d")
}
tibble::tibble(start = start_date, end = end_date)
})
# put it together
dplyr::bind_rows(dates)
}
id_relevant_tables <- function(x) {
# for year pages, e.g. 2011, the tables don't have names
# for year-month pages the tables usually have names to ID them
# easier to check if the first column has the name "Date\n"
yes <- sapply(x, function(xx) {
isTRUE(names(xx)[1]=="Date\n")
})
if (!any(yes)) {
stop("Something is wrong, could not find any relevant tables")
}
x[yes]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment