cimentadaj · May 19, 2019 17:51
diff --git a/country-language.R b/country-language.R
 # This script scrapes Wikipedia's list of official languages by country and
 # tries to pick the most spoken language of each country, producing a final
 # data frame of country - language pairs. Since some countries have several
 # official languages, I assume THE FIRST one listed is the most spoken. This
 # introduces some errors (e.g. Brazil gets German as its first language).
 # However, almost 90% of the work is done; some small tweaks are still needed.

 library(rvest)
 library(xml2)
 library(ISOcodes)
 library(tidyverse)
 # Download and parse the Wikipedia page. The page is HTML, so it is read with
 # the HTML parser (`read_html()`), which tolerates markup that is not
 # well-formed XML; a strict `read_xml()` parse can fail on such markup.
 webpage <- read_html("https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory")

 # Take the first table on the page whose class attribute contains "wikitable"
 # and turn it into a tibble. Cell text is left untrimmed (`trim = FALSE`) and
 # the column names are not repaired (`.name_repair = "minimal"`).
 table <- as_tibble(
   html_table(
     xml_find_all(webpage, "//table[contains(@class, 'wikitable')]")[[1]],
     fill = TRUE,
     trim = FALSE,
     header = TRUE
   ),
   .name_repair = "minimal"
 )

 # Use the first row as the column names, then drop that row. Negative
 # indexing replaces `2:nrow(table)`, which counts backwards (2, 1) when the
 # table has a single row.
 names(table) <- as.character(table[1, ])
 table <- table[-1, ]

 # Rows whose `Country` cell matches "Country" or "Official Language"
 # (presumably header rows repeated inside the table body) are dropped,
 # together with the row directly above each of them.
 cols_indx <- grep("Country|Official Language", table$Country)
 all_indx <- sort(c(cols_indx, cols_indx - 1))

 # Select the rows to keep by positive index. `table[-all_indx, ]` would
 # return zero rows when nothing matched, because `-integer(0)` selects nothing.
 table_fix <- table[setdiff(seq_len(nrow(table)), all_indx), ]

 # Positional column names V1, V2, ... (`seq_len()` instead of `1:ncol()`).
 names(table_fix) <- paste0("V", seq_len(ncol(table_fix)))

 # Build a two-column table (country, language) from the raw columns.
 fixed_table <-
   table_fix %>%
   mutate(
     # Country: strip bracketed text and leading/trailing whitespace from V1,
     # then rewrite "Russia" as "Russian Federation".
     country = V1 %>%
       str_remove_all("\\[.*\\]|^\\s+|\\s+$") %>%
       str_replace_all("Russia", "Russian Federation"),
     V2 = str_replace_all(V2, "Nationwide:", "")
   ) %>%
   mutate(
     # Language candidate: V2 without bracketed/parenthesised text and without
     # surrounding whitespace.
     language = V2 %>%
       str_remove_all("\\[.*\\]|\\(.*\\)") %>%
       str_remove_all("^\\s+|\\s+$"),
     # Fall back to V5 when V2 ended up empty, then clean the result again.
     language = if_else(language == "", V5, language) %>%
       str_remove_all("\\[.*\\]|\\(.*\\)"),
     # Put a newline in front of every uppercase letter and split on it; the
     # second piece runs from the first uppercase letter up to the next one
     # (assuming the cell held no earlier newline), i.e. the first
     # capitalised word.
     language = str_replace_all(language, "([[:upper:]])", "\n\\1"),
     language = trimws(str_split(language, "\\n", simplify = TRUE)[, 2])
   ) %>%
   select(country, language)

 # Lookup table with two columns: `iso3` (taken from `Id`) and `language`
 # (taken from `eng`, presumably the English name - confirm against the
 # ISO_639_3 data), with "Spanish (Castilian)" rewritten to "Spanish" and
 # "Slovenian" to "Slovene".
 iso_language <-
   as_tibble(ISO_639_3) %>%
   transmute(
     iso3 = Id,
     language = eng %>%
       str_replace("Spanish \\(Castilian\\)", "Spanish") %>%
       str_replace_all("Slovenian", "Slovene")
   )


 # Keep only the countries listed in `all_countries` and attach the ISO code
 # matching each country's language. `print()` returns its input invisibly, so
 # the full table is printed and still assigned to `final_df`.
 # NOTE(review): `all_countries` is not defined in this script as shown; it
 # must already exist when this runs - confirm where it comes from.
 final_df <-
   fixed_table %>%
   filter(country %in% all_countries) %>%
   # Join key spelled out: `language` is the only column both tables share.
   left_join(iso_language, by = "language") %>%
   print(n = Inf)

 final_df
	# This script scrapes Wikipedia's list of official languages by country and
	# tries to pick the most spoken language of each country, producing a final
	# data frame of country - language pairs. Since some countries have several
	# official languages, I assume THE FIRST one listed is the most spoken. This
	# introduces some errors (e.g. Brazil gets German as its first language).
	# However, almost 90% of the work is done; some small tweaks are still needed.

	library(rvest)
	library(xml2)
	library(ISOcodes)
	library(tidyverse)
	# Download and parse the Wikipedia page. The page is HTML, so it is read with
	# the HTML parser (`read_html()`), which tolerates markup that is not
	# well-formed XML; a strict `read_xml()` parse can fail on such markup.
	webpage <- read_html("https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory")

	# Take the first table on the page whose class attribute contains "wikitable"
	# and turn it into a tibble. Cell text is left untrimmed (`trim = FALSE`) and
	# the column names are not repaired (`.name_repair = "minimal"`).
	table <- as_tibble(
	  html_table(
	    xml_find_all(webpage, "//table[contains(@class, 'wikitable')]")[[1]],
	    fill = TRUE,
	    trim = FALSE,
	    header = TRUE
	  ),
	  .name_repair = "minimal"
	)

	# Use the first row as the column names, then drop that row. Negative
	# indexing replaces `2:nrow(table)`, which counts backwards (2, 1) when the
	# table has a single row.
	names(table) <- as.character(table[1, ])
	table <- table[-1, ]

	# Rows whose `Country` cell matches "Country" or "Official Language"
	# (presumably header rows repeated inside the table body) are dropped,
	# together with the row directly above each of them.
	# The alternation is a plain `|`: writing it as "\|" is an unrecognized
	# escape in an R string and stops the script from parsing.
	cols_indx <- grep("Country|Official Language", table$Country)
	all_indx <- sort(c(cols_indx, cols_indx - 1))

	# Select the rows to keep by positive index. `table[-all_indx, ]` would
	# return zero rows when nothing matched, because `-integer(0)` selects nothing.
	table_fix <- table[setdiff(seq_len(nrow(table)), all_indx), ]

	# Positional column names V1, V2, ... (`seq_len()` instead of `1:ncol()`).
	names(table_fix) <- paste0("V", seq_len(ncol(table_fix)))

	# Build a two-column table (country, language) from the raw columns.
	# The regex alternations use a plain `|` ("\|" is an unrecognized escape in
	# an R string and does not parse), and the bracket/parenthesis patterns use
	# `.*` so that contents of any length are removed, not just one character.
	fixed_table <-
	  table_fix %>%
	  mutate(
	    # Country: strip bracketed text and leading/trailing whitespace from V1,
	    # then rewrite "Russia" as "Russian Federation".
	    country = str_remove_all(V1, "\\[.*\\]|^\\s+|\\s+$") %>%
	      str_replace_all("Russia", "Russian Federation"),
	    V2 = str_replace_all(V2, "Nationwide:", ""),
	    # Language candidate: V2 without bracketed/parenthesised text and without
	    # surrounding whitespace.
	    language = str_remove_all(V2, "\\[.*\\]|\\(.*\\)") %>%
	      str_remove_all("^\\s+|\\s+$"),
	    # Fall back to V5 when V2 ended up empty, then clean the result again.
	    language = if_else(language == "", V5, language),
	    language = str_remove_all(language, "\\[.*\\]|\\(.*\\)"),
	    # Put a newline in front of every uppercase letter and split on it; the
	    # second piece runs from the first uppercase letter up to the next one
	    # (assuming the cell held no earlier newline), i.e. the first
	    # capitalised word.
	    language = str_split(
	      str_replace_all(language, "([[:upper:]])", "\n\\1"),
	      "\\n",
	      simplify = TRUE
	    )[, 2],
	    language = trimws(language)
	  ) %>%
	  select(country, language)

	# Lookup table with two columns: `iso3` (taken from `Id`) and `language`
	# (taken from `eng`, presumably the English name - confirm against the
	# ISO_639_3 data), with "Spanish (Castilian)" rewritten to "Spanish" and
	# "Slovenian" to "Slovene".
	iso_language <-
	  as_tibble(ISO_639_3) %>%
	  transmute(
	    iso3 = Id,
	    language = eng %>%
	      str_replace("Spanish \\(Castilian\\)", "Spanish") %>%
	      str_replace_all("Slovenian", "Slovene")
	  )


	# Keep only the countries listed in `all_countries` and attach the ISO code
	# matching each country's language. `print()` returns its input invisibly, so
	# the full table is printed and still assigned to `final_df`.
	# NOTE(review): `all_countries` is not defined in this script as shown; it
	# must already exist when this runs - confirm where it comes from.
	final_df <-
	  fixed_table %>%
	  filter(country %in% all_countries) %>%
	  # Join key spelled out: `language` is the only column both tables share.
	  left_join(iso_language, by = "language") %>%
	  print(n = Inf)

	final_df