Skip to content

Instantly share code, notes, and snippets.

@datalorax
Created May 2, 2020 04:06
Show Gist options
  • Save datalorax/388a3b9da5c67a49a3ce69684463fca7 to your computer and use it in GitHub Desktop.
Save datalorax/388a3b9da5c67a49a3ce69684463fca7 to your computer and use it in GitHub Desktop.
Scrape NYT table
library(RSelenium)
library(rvest)
library(tidyverse)
# Default ggplot2 theme for every plot drawn after this point: minimal theme
# at base size 15, title positioned relative to the whole plot, and a small
# gray caption.
theme_set(
  theme_minimal(base_size = 15) +
    theme(
      plot.caption = element_text(size = 8, color = "gray40"),
      plot.title.position = "plot"
    )
)
# Open the NYT coronavirus page in a Selenium-driven Chrome session, click the
# first button under the "clusters" element twice (presumably this expands the
# table -- confirm against the live page), and keep the rendered HTML in
# `parsed_pagesource`.
url <- "https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html"
rd <- rsDriver(browser = "chrome", chromever = "81.0.4044.69")
driver <- rd$client

parsed_pagesource <- tryCatch(
  {
    driver$navigate(url)

    # The button may not be in the DOM the moment navigate() returns, so poll
    # for it for up to ~10 seconds instead of looking it up exactly once.
    button_xpath <- '//*[@id="clusters"]/div/button'
    elements <- list()
    for (attempt in seq_len(10L)) {
      elements <- driver$findElements(button_xpath, using = "xpath")
      if (length(elements) > 0) {
        break
      }
      Sys.sleep(1)
    }
    if (length(elements) == 0) {
      stop(
        "No button found at xpath ", button_xpath,
        "; the page layout may have changed.",
        call. = FALSE
      )
    }

    # Pause around each click so the page has time to react.
    Sys.sleep(2)
    elements[[1]]$clickElement()
    Sys.sleep(2)
    elements[[1]]$clickElement()

    driver$getPageSource()[[1]]
  },
  finally = {
    # Always release the browser window and the Selenium server, even when
    # the scrape fails, so no stray processes are left running.
    driver$close()
    rd$server$stop()
  }
)
# Use rvest to pull the table under the "clusters" element out of the rendered
# HTML, rename its two columns to `institution` and `cases`, and make `cases`
# numeric.
tbl <- read_html(parsed_pagesource) %>%
  html_node(xpath = '//*[@id="clusters"]/div/table') %>%
  html_table() %>%
  as_tibble() %>%
  rename(
    institution = `Cases Connected To`,
    cases = Cases
  ) %>%
  # html_table() may already have converted the column to a number (e.g. when
  # no value carries a thousands separator). parse_number() only accepts
  # character input, so coerce first to keep both cases working.
  mutate(cases = parse_number(as.character(cases)))

# Print the scraped table for a quick visual check.
tbl
# Regular-expression alternations used to sort institution names into
# categories. Each pattern is a list of keywords joined with "|", so a name
# matches the category if it contains any one of them.
join_alternatives <- function(...) paste(c(...), collapse = "|")

prison_terms <- join_alternatives(
  "[Cc]orrection", "[Jj]ail", "[Pp]rison",
  "[Dd]etention", "[Ii]nstitution", "[Pp]enitentiary"
)
meat_terms <- join_alternatives("[Pp]ork", "[Bb]eef", "[Mm]eat")
other_food <- join_alternatives("[Ff]ood", "[Ff]arm")
nursinghome_terms <- join_alternatives(
  "[Nn]ursing", "[Ss]enior", "[Gg]eriatric",
  "[Aa]ssisted [Ll]iving", "[Rr]etirement"
)
health_terms <- join_alternatives(
  "[Hh]ealth", "[Cc]are", "[Hh]ospital",
  "[Dd]evelopment", "[Cc]onvalescent"
)
# Dots are escaped so "U.S.S" is matched literally.
boat_terms <- join_alternatives("U\\.S\\.S", "Grand Princess")
# Label every institution with exactly one outbreak category in a new `type`
# column. case_when() takes the first matching condition, so the order below
# is the priority order: a name matching both the prison and the health
# patterns is a "Prison", and anything matching no pattern is "Other".
tbls <- tbl %>%
  mutate(
    type = case_when(
      grepl(prison_terms, institution) ~ "Prison",
      grepl(meat_terms, institution) ~ "Meat",
      grepl(other_food, institution) ~ "Other Food",
      grepl(nursinghome_terms, institution) ~ "Nursing Home",
      grepl("[Rr]ehab", institution) ~ "Rehab",
      grepl(health_terms, institution) ~ "Other Health",
      grepl(boat_terms, institution) ~ "Boat",
      TRUE ~ "Other"
    )
  )
# Horizontal bar chart of total cases per outbreak category, with the bars
# ordered by their totals.
tbls %>%
  # Sum `cases` within each category; the total lands in column `n`.
  count(type, wt = cases) %>%
  # Order the category factor by its total so the bars appear sorted.
  mutate(type = fct_reorder(type, n)) %>%
  ggplot(mapping = aes(x = type, y = n)) +
  geom_col(alpha = 0.8, fill = "cornflowerblue") +
  # No padding on the count axis, so bars start flush at the axis line.
  scale_y_continuous(expand = c(0, 0)) +
  # Flip so categories are listed down the side and are easy to read.
  coord_flip() +
  labs(
    title = "Outbreaks of COVID-19",
    subtitle = "Total cases connected to...",
    caption = "Data collected from the NYT: https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html",
    x = "",
    y = ""
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment