Created
May 2, 2020 04:06
-
-
Save datalorax/388a3b9da5c67a49a3ce69684463fca7 to your computer and use it in GitHub Desktop.
Scrape NYT table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(RSelenium) | |
library(rvest) | |
library(tidyverse) | |
# Session-wide ggplot2 theme: minimal theme at base size 15, titles aligned to
# the whole plot area, and a small gray caption.
theme_set(
  theme_minimal(base_size = 15) +
    theme(
      plot.caption = element_text(size = 8, color = "gray40"),
      plot.title.position = "plot"
    )
)
# Fetch the rendered HTML of the NYT coronavirus page through a Selenium-driven
# Chrome session and store it in `parsed_pagesource`.
url <- "https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html"

# Start a Selenium server plus a Chrome client. `chromever` pins the
# chromedriver version that is requested.
rd <- rsDriver(browser = "chrome", chromever = "81.0.4044.69")
driver <- rd$client

# Always release the browser window and the Selenium server, even when the
# scrape fails part-way; otherwise the Chrome process and the server port stay
# occupied after the script ends.
parsed_pagesource <- tryCatch(
  {
    driver$navigate(url)

    # Buttons directly under the #clusters container.
    # NOTE(review): presumably the first one is a "show more" control that
    # expands the table -- confirm against the live page.
    elements <- driver$findElements(
      using = "xpath",
      value = '//*[@id="clusters"]/div/button'
    )
    if (length(elements) == 0) {
      stop("No button found under #clusters; the page layout may have changed.",
           call. = FALSE)
    }

    # Click the first button twice, pausing two seconds before each click.
    Sys.sleep(2)
    elements[[1]]$clickElement()
    Sys.sleep(2)
    elements[[1]]$clickElement()

    # getPageSource() returns a list; its first element is the HTML string.
    driver$getPageSource()[[1]]
  },
  finally = {
    driver$close()
    rd$server$stop()
  }
)
# Parse the rendered page source with rvest and extract the table found under
# the #clusters container as a tibble with columns `institution` and `cases`.
tbl <- read_html(parsed_pagesource) %>%
  html_node(xpath = '//*[@id="clusters"]/div/table') %>%
  html_table() %>%
  as_tibble() %>%
  rename(institution = `Cases Connected To`,
         cases = Cases) %>%
  # parse_number() only accepts character input. html_table() may already have
  # converted the column to a numeric type (e.g. when no value contains a
  # thousands separator), so coerce to character first to keep this step from
  # erroring in that case.
  mutate(cases = parse_number(as.character(cases)))

# Print the scraped table for a quick visual check.
tbl
# Regular-expression alternations used to classify institution names.
# Each vector lists the individual keywords; they are joined with "|" so that
# a match on any keyword counts. Character classes such as "[Cc]" accept either
# capitalisation of the first letter.
prison_terms <- paste(
  c("[Cc]orrection", "[Jj]ail", "[Pp]rison", "[Dd]etention",
    "[Ii]nstitution", "[Pp]enitentiary"),
  collapse = "|"
)
meat_terms <- paste(c("[Pp]ork", "[Bb]eef", "[Mm]eat"), collapse = "|")
other_food <- paste(c("[Ff]ood", "[Ff]arm"), collapse = "|")
nursinghome_terms <- paste(
  c("[Nn]ursing", "[Ss]enior", "[Gg]eriatric", "[Aa]ssisted [Ll]iving",
    "[Rr]etirement"),
  collapse = "|"
)
health_terms <- paste(
  c("[Hh]ealth", "[Cc]are", "[Hh]ospital", "[Dd]evelopment",
    "[Cc]onvalescent"),
  collapse = "|"
)
# Dots are escaped so "U.S.S" is matched literally.
boat_terms <- paste(c("U\\.S\\.S", "Grand Princess"), collapse = "|")
# Assign every institution to exactly one category and store it in `type`.
# case_when() takes the first matching branch, so the order below is the
# priority order: an institution matching several keyword sets is labelled
# with the earliest one, and anything matching none falls through to "other".
tbls <- tbl %>%
  mutate(
    type = case_when(
      grepl(prison_terms, institution) ~ "prison",
      grepl(meat_terms, institution) ~ "meat",
      grepl(other_food, institution) ~ "other_food",
      grepl(nursinghome_terms, institution) ~ "nursing_home",
      grepl("[Rr]ehab", institution) ~ "rehab",
      grepl(health_terms, institution) ~ "other_health",
      grepl(boat_terms, institution) ~ "boat",
      TRUE ~ "other"
    ),
    # Turn the snake_case keys into display labels, e.g. "nursing_home"
    # becomes "Nursing Home".
    type = stringr::str_to_title(gsub("_", " ", type))
  )
# Horizontal bar chart of total cases per outbreak category.
tbls %>%
  # Sum `cases` within each category; the total lands in column `n`.
  count(type, wt = cases) %>%
  # Order the category factor by its total so the bars come out sorted.
  mutate(type = fct_reorder(type, n)) %>%
  ggplot(mapping = aes(x = type, y = n)) +
  geom_col(alpha = 0.8, fill = "cornflowerblue") +
  # Flip so categories run down the side and bars extend horizontally.
  coord_flip() +
  # No padding on the count scale, so bars start flush at the axis.
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    title = "Outbreaks of COVID-19",
    subtitle = "Total cases connected to...",
    caption = "Data collected from the NYT: https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html",
    x = "",
    y = ""
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment