Last active
May 19, 2019 17:51
-
-
Save cimentadaj/c36d0d02dbe068d7a419713a1aaf1a60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script scrapes Wikipedia's list of official languages by country and
# tries to pick the most spoken language of each country, producing a final
# data frame of country - language pairs. Since some countries have several
# official languages, I assume THE FIRST one listed is the most spoken. This
# introduces some errors (e.g. Brazil ends up with German as its first
# language). However, almost 90% of the work is done; some small tweaks are
# still needed.
library(rvest) | |
library(xml2) | |
library(ISOcodes) | |
library(tidyverse) | |
# Download the Wikipedia page. It is an HTML document, so parse it with
# read_html(); read_xml() expects well-formed XML and is not meant for HTML.
webpage <- read_html("https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory")

# Take the first table whose class attribute contains "wikitable" and turn it
# into a tibble, keeping whatever column names html_table() produced.
table <-
  webpage %>%
  xml_find_all("//table[contains(@class, 'wikitable')]") %>%
  .[[1]] %>%
  html_table(fill = TRUE, trim = FALSE, header = TRUE) %>%
  as_tibble(.name_repair = "minimal")

# Promote the first data row to column names, then drop that row.
# `-1` is used instead of `2:nrow(table)` because `2:1` would count downwards
# and re-select rows when the table has a single row.
names(table) <- as.character(table[1, ])
table <- table[-1, ]

# Rows whose `Country` cell matches a header label, plus the row right before
# each of them, are discarded.
cols_indx <- which(grepl(pattern = "Country|Official Language", table$Country))
all_indx <- sort(c(cols_indx, cols_indx - 1))

# Select the rows to keep positively: `table[-integer(0), ]` would return zero
# rows (not all rows) if no header row had been matched.
keep_rows <- setdiff(seq_len(nrow(table)), all_indx)
table_fix <- table[keep_rows, ]

# Positional names (V1, V2, ...) so later steps do not depend on the scraped
# header text.
names(table_fix) <- paste0("V", seq_len(ncol(table_fix)))
# Build a two-column tibble (country, language) from the positional columns.
#   * country:  V1 with bracketed text and leading/trailing whitespace removed,
#               and "Russia" rewritten to "Russian Federation".
#   * language: V2 without the "Nationwide:" label, bracketed/parenthesised
#               text and surrounding whitespace; when that leaves an empty
#               string, V5 is used instead. Only the text starting at the
#               first upper-case letter and ending before the next one is kept.
# NOTE(review): what V2 and V5 hold depends on the scraped table layout --
# confirm against the current Wikipedia page.
fixed_table <- table_fix %>%
  mutate(
    country = V1 %>%
      str_remove_all("\\[.*\\]|^\\s+|\\s+$") %>%
      str_replace_all("Russia", "Russian Federation")
  ) %>%
  mutate(
    V2 = str_remove_all(V2, "Nationwide:"),
    language = V2 %>%
      str_remove_all("\\[.*\\]|\\(.*\\)") %>%
      str_remove_all("^\\s+|\\s+$")
  ) %>%
  # Fall back to V5 when nothing is left of V2, then strip annotations again
  # because the fallback text has not been cleaned yet.
  mutate(
    language = if_else(language == "", V5, language),
    language = str_remove_all(language, "\\[.*\\]|\\(.*\\)")
  ) %>%
  # Put a line break in front of every upper-case letter and split on line
  # breaks; piece 1 is whatever precedes the first capital, piece 2 is the
  # first capitalised chunk, which is the one kept.
  mutate(
    language = str_replace_all(language, "([[:upper:]])", "\n\\1"),
    language = str_split(language, "\\n", simplify = TRUE)[, 2],
    language = trimws(language)
  ) %>%
  select(country, language)
# Lookup table of ISO 639-3 identifiers (`iso3`) and language names
# (`language`), with two names adjusted to the spellings used in the scraped
# table: "Spanish (Castilian)" -> "Spanish" and "Slovenian" -> "Slovene".
# NOTE(review): assumes ISO_639_3 exposes `Id` and `eng` columns -- confirm
# against the installed ISOcodes version.
iso_language <- ISO_639_3 %>%
  as_tibble() %>%
  select(iso3 = Id, eng) %>%
  mutate(
    language = eng %>%
      str_replace("Spanish \\(Castilian\\)", "Spanish") %>%
      str_replace_all("Slovenian", "Slovene")
  ) %>%
  select(iso3, language)
# Keep only the countries present in `all_countries` and attach the ISO code
# of each country's language.
# NOTE(review): `all_countries` is not defined in the visible part of this
# script; it has to exist in the environment before this step runs.
# The join key is spelled out so the join cannot silently change if either
# table gains another shared column (and the "Joining with `by = ...`" message
# is avoided).
final_df <-
  fixed_table %>%
  filter(country %in% all_countries) %>%
  left_join(iso_language, by = "language")

# Show every row once, then the default printout, as the script did before.
print(final_df, n = Inf)
final_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment