Skip to content

Instantly share code, notes, and snippets.

@cimentadaj
Last active May 19, 2019 17:51
Show Gist options
  • Save cimentadaj/c36d0d02dbe068d7a419713a1aaf1a60 to your computer and use it in GitHub Desktop.
Save cimentadaj/c36d0d02dbe068d7a419713a1aaf1a60 to your computer and use it in GitHub Desktop.
# This script scrapes Wikipedia's list of official languages by country and
# tries to pick the most spoken language of each country, producing a final
# data frame of country - language pairs. Since some countries have several
# official languages, I assume THE FIRST one listed is the most spoken. This
# introduces some errors (e.g. Brazil ends up with German as its first
# language). However, almost 90% of the work is done; some small manual tweaks
# are still needed.
library(rvest)
library(xml2)
library(ISOcodes)
library(tidyverse)
# Download the Wikipedia page and extract the first table whose class
# contains "wikitable".
# read_html() is used rather than read_xml(): the page is an HTML document,
# and the strict XML parser is not guaranteed to accept it.
page_url <- "https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory"
webpage <- read_html(page_url)

wiki_tables <- xml_find_all(webpage, "//table[contains(@class, 'wikitable')]")
if (length(wiki_tables) == 0) {
  # Fail with a clear message instead of a "subscript out of bounds" error.
  stop(
    "No table with class 'wikitable' was found at ", page_url,
    "; the page layout may have changed.",
    call. = FALSE
  )
}

table <-
  wiki_tables[[1]] %>%
  html_table(fill = TRUE, trim = FALSE, header = TRUE) %>%
  as_tibble(.name_repair = "minimal")

# Use the values of the first parsed row as the column names, then drop that
# row. Negative indexing is used so that a one-row table yields zero rows
# (2:nrow(table) would count backwards as c(2, 1) in that case).
names(table) <- as.character(table[1, ])
table <- table[-1, ]
# Locate rows whose Country cell repeats the header text ("Country" or
# "Official Language"); these are header rows repeated inside the table body.
cols_indx <- which(grepl(pattern = "Country|Official Language", table$Country))

# The row immediately before each repeated header is dropped as well.
# NOTE(review): presumably that row is a section/separator row -- confirm
# against the current page layout.
all_indx <- sort(c(cols_indx, cols_indx - 1))

# Select the rows to KEEP by positive index. Dropping with table[-all_indx, ]
# silently returns zero rows when all_indx is empty, because -integer(0) is
# an empty selection rather than "drop nothing".
keep_indx <- setdiff(seq_len(nrow(table)), all_indx)
table_fix <- table[keep_indx, ]

# Replace the scraped headers with positional names V1, V2, ...
names(table_fix) <- paste0("V", seq_len(ncol(table_fix)))
# Build a two-column tibble (country, language) from the positional columns
# of table_fix: V1 holds the country cell, V2 the first language cell, and V5
# is used as a fallback when V2 is empty after cleaning.
fixed_table <-
  table_fix %>%
  mutate(
    # Strip bracketed footnotes and leading/trailing whitespace from the
    # country cell, then rename "Russia" to "Russian Federation".
    country = V1 %>%
      str_remove_all("\\[.*\\]|^\\s+|\\s+$") %>%
      str_replace_all("Russia", "Russian Federation"),
    # Remove the "Nationwide:" label from the language cell.
    V2 = str_replace_all(V2, "Nationwide:", "")
  ) %>%
  mutate(
    # Drop bracketed / parenthesised text and surrounding whitespace.
    language = V2 %>%
      str_remove_all("\\[.*\\]|\\(.*\\)") %>%
      str_remove_all("^\\s+|\\s+$"),
    # Fall back to the fifth column when nothing is left in the second one.
    language = if_else(language == "", V5, language),
    # The fallback value may itself carry footnotes or parentheses.
    language = str_remove_all(language, "\\[.*\\]|\\(.*\\)"),
    # Insert a line break before every uppercase letter, split on line
    # breaks and keep the second piece. For a string that starts with an
    # uppercase letter, that piece is the text from the first uppercase
    # letter up to (not including) the next one.
    language = language %>%
      str_replace_all("([[:upper:]])", "\n\\1") %>%
      str_split("\\n", simplify = TRUE) %>%
      .[, 2],
    language = trimws(language)
  ) %>%
  select(country, language)
# Lookup table of language codes: keeps the Id column (renamed to iso3) and a
# `language` column derived from `eng`, with two names rewritten
# ("Spanish (Castilian)" -> "Spanish", "Slovenian" -> "Slovene").
iso_language <-
  ISO_639_3 %>%
  as_tibble() %>%
  transmute(
    iso3 = Id,
    language = eng %>%
      str_replace("Spanish \\(Castilian\\)", "Spanish") %>%
      str_replace_all("Slovenian", "Slovene")
  )
# Reference list of country names used to discard non-country rows.
# NOTE(review): `all_countries` is never defined in this script, so the
# filter below failed with "object not found". ISO 3166-1 names are assumed
# to be the intended list (the script loads ISOcodes and renames "Russia" to
# "Russian Federation") -- confirm. A value already defined by the caller is
# left untouched.
if (!exists("all_countries")) {
  all_countries <- ISO_3166_1$Name
}

# Keep only recognised countries and attach the language code by language
# name. The join key is spelled out so an unexpected shared column cannot
# silently change the join.
final_df <-
  fixed_table %>%
  filter(country %in% all_countries) %>%
  left_join(iso_language, by = "language")

# Print every row once.
print(final_df, n = Inf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment