Last active
May 14, 2024 12:03
-
-
Save bearloga/519a701a6a9bc7c3ba9f to your computer and use it in GitHub Desktop.
Scripts for scraping divorce demographics by country from Wikipedia and plotting it in R with ggplot2 with the respective country flags in place of points.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Script for scraping Wikipedia for data to use with the geom_flag() prototype | |
## CONTACT: Mikhail Popov (@bearloga // mikhail[at]mpopov[dot]com) | |
## URL: https://gist.github.com/bearloga/519a701a6a9bc7c3ba9f | |
# install.packages("import") | |
library(rvest) # install.packages("rvest") | |
library(magrittr) | |
import::from(dplyr, mutate, select, keep_where = filter, left_join, distinct) | |
population <- read_html("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population") %>% | |
html_nodes("table") %>% | |
{ .[[1]] } %>% | |
html_table() %>% | |
mutate(Population = as.integer(gsub(",", "", Population)), | |
Country = sub("\\(.*\\)", "", gsub("\\[.*\\]", "", `Country (or dependent territory)`))) %>% | |
select(c(Country, Population)) %>% | |
distinct() | |
marriage_divorce <- read_html("https://en.wikipedia.org/wiki/Divorce_demography") %>% | |
html_nodes("table") %>% | |
{ .[[1]] } %>% | |
html_table() %>% | |
mutate(Country = stringr::str_trim(Country)) %>% | |
distinct() %>% | |
keep_where(!is.na(`Crude marriage rate`) & !is.na(`Crude divorce rate`)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Prototype for what might someday become geom_flag() for use with ggplot2 | |
## CONTACT: Mikhail Popov (@bearloga // mikhail[at]mpopov[dot]com) | |
## URL: https://gist.github.com/bearloga/519a701a6a9bc7c3ba9f | |
## LICENSE: MIT | |
## SVG Flag Collection by Jakob Flierl | |
## Prerequisites: | |
## 1. Install the R packages we'll be using in this script: | |
install.packages(c("magrittr", "dplyr", "rvest", "import", | |
"ISOcodes", "ggplot2", "grid", "png")) | |
## 2. Download the MIT-licensed flag-icon-css repository from GitHub: | |
git2r::clone("https://github.com/lipis/flag-icon-css", "flag-icon-css") | |
## 3. Convert SVGs to PNGs: | |
## Unfortunately, we can only use PNGs and not SVGs, so we need to convert the flags to PNGs, | |
## which requires inkscape (for example), either as an Application or as a MacPort. | |
## e.g. $> sudo port install inkscape | |
## To convert SVGs to PNGs, run the following commands in Terminal: | |
## $> cd /path/to/flag-icon-css/flags/4x3/ | |
## $> for f in *.svg; do g=$(echo $f | sed 's=svg=png='); inkscape -z -w 200 -h 150 -e $g $f; done | |
## $> mkdir ../pngs && mv *.png ../pngs/ | |
## ...this will take some time... | |
library(magrittr) | |
import::from(dplyr, mutate, left_join, keep_where = filter) | |
library(ggplot2) | |
## The flag PNGs are named following the ISO 3166-1 standard. Let's get the country names: | |
data("ISO_3166_1", package = "ISOcodes") | |
flags <- data.frame(filename = list.files("flag-icon-css/flags/pngs", "\\.png"), | |
stringsAsFactors = FALSE) %>% | |
mutate(filepath = file.path("flag-icon-css/flags/pngs", filename), | |
code = toupper(stringr::str_extract(filename, "^[a-z]{2}"))) %>% | |
left_join(ISO_3166_1[, c("Name", "Alpha_2")], by = c("code" = "Alpha_2")) %>% | |
set_names(c("filename", "filepath", "code", "country")) %>% | |
keep_where(!is.na(country)) | |
## Scrape data from Wikipedia (requires rvest & dplyr packages) | |
# source("data.R") | |
## To avoid problems later, let's restrict the scraped datasets | |
## to countries for which we have the flags for. | |
population %<>% keep_where(Country %in% flags$country) | |
marriage_divorce %<>% keep_where(Country %in% population$Country) | |
marriage_divorce %<>% left_join(population, by = "Country") | |
## Let's make the base ggplot that we'll add the flags to later: | |
gg <- ggplot(data = marriage_divorce, | |
aes(x = `Crude marriage rate`, | |
y = `Crude divorce rate`)) + | |
geom_point(alpha = 0.1) + | |
labs(title = "Divorce demography of countries", | |
x = "Number of marriages per 1,000 population", | |
y = "Number of divorces per 1,000 population") | |
# print(gg) # To test that everything so far has worked as it should. | |
## This next step is going to take a while. We do this to load the flags into the memory | |
## as grobs so we can have an easier and faster time adding them to any ggplots. | |
flag_grobs <- lapply(flags$country, function(country) { | |
return(grid::rasterGrob(png::readPNG(flags$filepath[flags$country == country]))) | |
}) | |
names(flag_grobs) <- flags$country | |
## The following chunk will use the data to inform where the flags should be added: | |
pseudogeom_flag <- mapply(function(x, y, country, flag_scale = 1/4) { | |
return(annotation_custom(flag_grobs[[country]], | |
x - 2 * flag_scale, x + 2 * flag_scale, | |
y - 1.5 * flag_scale, y + 1.5 * flag_scale)) | |
## The final annotation is a flag with an aspect ratio of 4:3, centered at the (x, y). | |
}, | |
x = marriage_divorce$`Crude marriage rate`, | |
y = marriage_divorce$`Crude divorce rate`, | |
country = marriage_divorce$Country) | |
## Plot it! | |
(gg <- gg + pseudogeom_flag) | |
## We can also make it look a little better by applying the following theme: | |
(gg <- gg + ggthemes::theme_tufte(base_family = "Gill Sans") + | |
theme(panel.grid = element_line(color = "gray80", size = 1))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment