Created
May 27, 2020 19:08
-
-
Save jebyrnes/79478c48a142cc2f22359d8aaf871c9b to your computer and use it in GitHub Desktop.
Get the ISO 3166-2 subdivision codes from Wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(dplyr) | |
library(purrr) | |
library(tidyr) | |
# Source page: https://en.wikipedia.org/wiki/ISO_3166-2
baseurl <- "https://en.wikipedia.org"
# Built from baseurl so the host name is written down in one place only.
url <- paste0(baseurl, "/wiki/ISO_3166-2")

# Collect the site-relative hrefs found in the rows of the first table on the
# index page, keeping only those that contain "ISO_3166".
links <- read_html(url) %>%
  html_nodes("table") %>%
  `[`(1) %>% # first table only
  html_nodes("tr") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  grep("ISO_3166", ., value = TRUE) %>%
  unique() # a repeated href would otherwise be downloaded more than once
#' Make the column names of a scraped table unique
#'
#' Tables in which some columns carry two header rows come back with repeated
#' column names, which `bind_rows()` does not accept. When every name is
#' already unique the table is returned untouched. Otherwise each name gets
#' its column position appended (`<name>_<i>`), any column originally called
#' "Code" keeps that exact name, and the first row is dropped (presumably the
#' second header row that was read as data -- verify against the scraped
#' pages).
#'
#' @param adf A data frame (or tibble).
#' @return `adf` unchanged when its names are unique; otherwise `adf` with
#'   de-duplicated names and without its first row.
fix_dup_names <- function(adf) {
  n <- names(adf)
  if (anyDuplicated(n) == 0L) {
    return(adf)
  }
  # seq_along() instead of 1:length(n): same positions, no 1:0 trap.
  names(adf) <- paste(n, seq_along(n), sep = "_")
  names(adf)[n == "Code"] <- "Code"
  # drop = FALSE keeps the result a data frame whatever its width.
  adf[-1, , drop = FALSE]
}
#' Scrape the sortable wikitables from one linked page
#'
#' @param a_link Site-relative href of the page; it is appended to the global
#'   `baseurl` to form the address that is fetched.
#' @param quiet When `FALSE` (the default), `a_link` is printed before the
#'   page is fetched.
#' @return A tibble stacking every `table.wikitable.sortable` found in the
#'   page body, with an added `country_code` column holding `a_link` minus
#'   its "/wiki/ISO_3166-2:" part.
parse_one_page <- function(a_link, quiet = FALSE) {
  if (!quiet) {
    print(a_link)
  }

  page <- read_html(paste0(baseurl, a_link))
  table_nodes <- html_nodes(page, "body table.wikitable.sortable")
  page_tables <- map(html_table(table_nodes, fill = TRUE), fix_dup_names)

  # rbindlist() is used instead of bind_rows() to deal with columns whose
  # class differs from one table to the next.
  stacked <- as_tibble(data.table::rbindlist(page_tables, fill = TRUE))

  mutate(stacked, country_code = gsub("\\/wiki\\/ISO_3166-2\\:", "", a_link))
}
#' Convert one scraped page table to long format
#'
#' Every column is first coerced to character (some columns are read as
#' integers). All columns whose names do not match "Code" are then gathered
#' into a name/value pair, and the `Code` column is renamed to `code`. Note
#' that tidyselect's `matches()` ignores case by default, so columns such as
#' `country_code` also stay out of the pivot.
#'
#' @param adf A data frame with a `Code` column.
#' @param quiet When `FALSE` (the default), the first row of `adf` is printed.
#' @return A long tibble with `subdivision_name_type` (the former column
#'   name) and `subdivision_name` (its value) next to the untouched columns.
reshape_tab <- function(adf, quiet = FALSE) {
  if (!quiet) {
    print(adf[1, ])
  }

  as_text <- mutate_all(adf, as.character)

  long_form <- pivot_longer(
    as_text,
    cols = !matches("Code"),
    names_to = "subdivision_name_type",
    values_to = "subdivision_name"
  )

  rename(long_form, code = Code)
}
# Scrape every linked page, drop pages that yielded no rows, and stack the
# per-page tables in long format.
tabs <- map(links, parse_one_page)
tabs <- discard(tabs, ~ nrow(.x) == 0)
iso_df <- map_df(tabs, reshape_tab)

iso_df_filtered <- iso_df %>%
  filter(!is.na(subdivision_name)) %>%
  # fix missing codes: take the first non-missing value among the fallbacks
  mutate(
    code = coalesce(
      code,
      `Former code`,
      `Alternative code`,
      `Netherlands ISO 3166-2 code`
    )
  ) %>%
  # get rid of rows that still have no code (row-wise test, no grouping needed)
  filter(!is.na(code)) %>%
  # get rid of duplicate rows
  group_by(code, subdivision_name, country_code) %>%
  slice(1L) %>%
  ungroup() %>%
  # sift down
  select(code, country_code, subdivision_name_type, subdivision_name) %>%
  filter(
    # "[nN]" rather than "[n,N]": the comma was a literal member of the class
    grepl("[nN]ame", subdivision_name_type) |
      subdivision_name_type == "Local variant"
  )

iso_df_filtered
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment