Created
April 15, 2020 08:04
-
-
Save johnburnmurdoch/1d23978fc8213ff656d9f629608dd1f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install and load required packages | |
install.packages("needs") | |
library(needs) | |
needs(tidyverse, magrittr, animation, pdftools, png, scales) | |
# Function that extracts data from Google Mobility PDFs | |
process_google_mobility <- function(country_code, start_date, end_date){ | |
# Convert first page of PDF into high-res PNG | |
pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 1, dpi = 300, filenames = "IMG1.png") | |
# Convert second page of PDF into high-res PNG | |
pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 2, dpi = 300, filenames = "IMG2.png") | |
# Read PDF page one image into R as an array of red, green and blue pixel values | |
img <- readPNG("IMG1.png") | |
# Rescale the pixel values from 0-1 into 0-255 | |
r <- img[,,1] * 255 | |
g <- img[,,2] * 255 | |
b <- img[,,3] * 255 | |
# rgb(66,133,244) is the rgb specification of the blue line used in Google’s charts | |
# retail chart extent is from pixel 1500 to 1786 vertically, and from 853 to 1328 left to right, so we loop through those pixels from left to right, and in each case | |
val <- c(853:1328) %>% | |
map_dbl(~{ | |
# find the vertical pixels that match our specified rgb() colour, and average them to get the middle pixel of the line on the chart | |
mean(which( | |
r[1500:1822,.x]==66 & g[1500:1822,.x]==133 & b[1500:1822,.x]==244 | |
)) | |
}) %>% | |
# rescale this value so instead of being a pixel number, it’s a value from -100 to 80 (the domain of the y-axis) | |
scales::rescale(to=c(80, -100), from = c(1, 322)) | |
# then generate a series of data values equal to the number of horizontal chart pixels, so we can match a date/time to every y-axis value | |
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val)) | |
# then join values to dates, summarise by individual day (one value per day), and add columns specifying the chart’s subject matter and the country | |
retail <- tibble(date, val) %>% | |
group_by(date) %>% | |
summarise(val = mean(val)) %>% | |
mutate(measure = "Retail", country_code) | |
# then repeat for parks and transit | |
# parks | |
val <- c(853:1328) %>% | |
map_dbl(~{ | |
mean(which( | |
r[2476:2798,.x]==66 & g[2476:2798,.x]==133 & b[2476:2798,.x]==244 | |
)) | |
}) %>% | |
scales::rescale(to=c(80, -100), from = c(1, 322)) | |
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val)) | |
parks <- tibble(date, val) %>% | |
group_by(date) %>% | |
summarise(val = mean(val)) %>% | |
mutate(measure = "Parks", country_code) | |
# transit is on page two, so we need to load in the new image | |
img <- readPNG("IMG2.png") | |
r <- img[,,1] * 255 | |
g <- img[,,2] * 255 | |
b <- img[,,3] * 255 | |
# transit | |
val <- c(785:1259) %>% | |
map_dbl(~{ | |
mean(which( | |
r[222:544,.x]==66 & g[222:544,.x]==133 & b[222:544,.x]==244 | |
)) | |
}) %>% | |
scales::rescale(to=c(80, -100), from = c(1, 322)) | |
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val)) | |
transit <- tibble(date, val) %>% | |
group_by(date) %>% | |
summarise(val = mean(val)) %>% | |
mutate(measure = "Transit", country_code) | |
# then combine retail, parks and transit into one table, and return this | |
return(bind_rows(retail, parks, transit)) | |
} | |
# Loop through countries of interest, processing Google’s two reports (to date) for each of them, and joining them together | |
countries <- c("GB", "US", "FR", "DE", "ES", "IT", "NO", "NL", "SE", "BE", "AT") | |
countries %>% | |
map_dfr(~{ | |
data <- bind_rows( | |
process_google_mobility(.x, "2020-02-16", "2020-03-29"), | |
process_google_mobility(.x, "2020-02-23", "2020-04-05") | |
) | |
}) -> GCM_master | |
# PLot this dataset as small multiples; one chart for each topic, one line on each per country | |
GCM_master %>% | |
# Get rid of any missing data | |
filter(!is.na(val)) %>% | |
# Make sure dates are read as dates (days) not date-times | |
mutate( | |
date = as.character(as.Date(date, "%Y-%m-%d")), | |
date = as.Date(date) | |
) %>% | |
# Make sure we only have one value per country per topic per day | |
group_by(measure, date, country_code) %>% | |
summarise(val = mean(val)) %>% | |
group_by(measure, country_code) %>% | |
# Create a smoothed moving average to iron out noise in the daily data (if you want) | |
mutate(smoothed = roll_meanr(val, 7)) %>% | |
# Filter for only a subset of countries of interest for this plot | |
filter(country_code %in% c("GB", "IT", "FR", "ES", "SE", "DE", "US")) %>% | |
# Plot moving average (or raw data) vs date | |
ggplot(aes(date, smoothed)) + | |
# Add a dark grid line for zero on the y-asix | |
geom_hline(yintercept = 0) + | |
# Draw one line per country | |
geom_line(aes(col = country_code), size=0.5) + | |
# Add a highlight for a country of focus, first adding a thicker white line to create a border behind the main one | |
geom_line(size=1.5, data = . %>% filter(country_code == "GB"), col = "white") + | |
geom_line(size=1, data = . %>% filter(country_code == "GB"), col = "black") + | |
# Add country labels to the end of each line | |
geom_text_repel(aes(col = country_code, label = country_code), direction = "y", data = . %>% group_by(country_code, measure) %>% top_n(1, date), hjust=0) + | |
# Clean up the x-axis | |
scale_x_date(limits = c(as.Date("2020-02-21"), as.Date("2020-04-06")), breaks = c(as.Date("2020-02-22"), as.Date("2020-04-05")), labels = function(x)format(x,"%d %b")) + | |
# Duplicate y-axis for easy reading of values | |
scale_y_continuous(limits=c(-100,40), breaks = seq(-80, 40, 20), expand=c(0,0), sec.axis = dup_axis()) + | |
# Small multiples: one chart per topic | |
facet_wrap(~measure, nrow=1) + | |
# Clean up the plot theme | |
theme_minimal() + | |
theme( | |
panel.grid.minor = element_blank(), | |
panel.grid.major.x = element_blank(), | |
axis.ticks = element_line(), | |
axis.title = element_blank(), | |
legend.position = "none" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment