Skip to content

Instantly share code, notes, and snippets.

@andrewzeitlin
Last active September 21, 2021 04:31
Show Gist options
  • Save andrewzeitlin/9c4dfc47d02f6e1e86e8866b491bb7d7 to your computer and use it in GitHub Desktop.
Save andrewzeitlin/9c4dfc47d02f6e1e86e8866b491bb7d7 to your computer and use it in GitHub Desktop.
DCPS covid cases
# Scrape and visualize COVID case data from DCPS website.
library(here)
library(rvest)
library(dplyr)
library(stringr)
library(lubridate)
library(ggplot2)
library(directlabels)
url <- "https://coronavirus.dc.gov/node/1506966"

# Download and parse the page exactly once. Dates and statistics are paired
# by position further down, so both must be read from the same snapshot of
# the page rather than from two separate requests.
page <- read_html(url)

# List of dates: the text of every <strong> element on the page.
dates <- page %>%
  html_nodes("strong") %>%
  html_text()
# Clean unwanted headers from the dates list (any entry mentioning "DCPS").
dates <- dates[!str_detect(dates, fixed("DCPS", ignore_case = TRUE))]

# Statistics re those dates: the text of every list item inside the page node.
stats <- page %>%
  html_nodes("#node-page-1506966 li") %>%
  html_text()

# Case-insensitive literal (non-regex) substring test, vectorised over `x`.
has_term <- function(x, term) {
  str_detect(x, fixed(term, ignore_case = TRUE))
}

# Split the list items into four series by keyword. Items mentioning
# "pending" are excluded from the positive-case series.
stats.cases.staff <- stats[
  has_term(stats, "personnel") &
    has_term(stats, "positive") &
    !has_term(stats, "pending")
]
stats.quarantine.staff <- stats[
  has_term(stats, "personnel") & has_term(stats, "quarantine")
]
stats.cases.students <- stats[
  has_term(stats, "students") &
    has_term(stats, "positive") &
    !has_term(stats, "pending")
]
stats.quarantine.students <- stats[
  has_term(stats, "students") & has_term(stats, "quarantine")
]
# Confirm that resulting data are rectangular
lengths <- c(
  dates = length(dates),
  cases.staff = length(stats.cases.staff),
  quarantine.staff = length(stats.quarantine.staff),
  cases.students = length(stats.cases.students),
  quarantine.students = length(stats.quarantine.students)
)
print(lengths)
# data.frame() recycles an atomic vector a whole number of times when its
# length divides the longest input, which would silently pair statistics with
# the wrong dates. Fail loudly instead of building a misaligned table.
if (length(unique(lengths)) != 1L) {
  stop(
    "Scraped vectors have different lengths (",
    paste(names(lengths), lengths, sep = " = ", collapse = ", "),
    "); the page layout may have changed.",
    call. = FALSE
  )
}

# One row per scraped date, with the raw (still textual) statistics.
# NOTE(review): rows are paired purely by position, i.e. this assumes the
# i-th entry of every statistics vector belongs to the i-th date - confirm
# against the page.
dcps <- data.frame(
  date = dates,
  cases.staff = stats.cases.staff,
  quarantine.staff = stats.quarantine.staff,
  cases.students = stats.cases.students,
  quarantine.students = stats.quarantine.students
)
# Clean variables
## Parse the scraped date text with mdy(), i.e. read as month-day-year.
dcps$date <- mdy(dcps$date)

## Drop every character that is not a digit, "." or "-" and read what is
## left as a number.
## NOTE(review): because "." and "-" are kept, text such as a trailing
## hyphenated token would leak into the parsed value - confirm against the
## actual wording of the page.
to.number <- function(text) {
  as.numeric(gsub("[^0-9.-]", "", text))
}

## Convert each of the four count columns in place.
count.columns <- c(
  "cases.staff",
  "cases.students",
  "quarantine.staff",
  "quarantine.students"
)
for (column in count.columns) {
  dcps[[column]] <- to.number(dcps[[column]])
}
# Derived variables
## Dates of term
start.21.t1 <- mdy("August 30, 2021") # Start of 2021-22 T1
start.20.t4 <- mdy("April 19, 2021") # Start of 2020-21 T4
start.20.t3 <- mdy("February 1, 2021") # Start of 2020-21-T3
start.20.t2 <- mdy("November 9, 2020") # Start of 2020-21-T2
start.20.t1 <- mdy("August 31, 2020") # Start of 2020-21-T1

## Term start dates in chronological order: element i is the first day of
## term i (terms 1-4 belong to 2020-21, term 5 is 2021-22 T1).
term.starts <- c(
  start.20.t1,
  start.20.t2,
  start.20.t3,
  start.20.t4,
  start.21.t1
)

## School year
## Computed for the whole column at once; a missing date yields a missing
## year instead of relying on masked assignment into a column that does not
## exist yet.
dcps$year <- ifelse(dcps$date >= start.21.t1, 2021, 2020)

## School term
## findInterval() returns i when term.starts[i] <= date < term.starts[i + 1],
## 5 for dates on or after the last start, and 0 for dates before the first
## start; the latter have no term and are set to NA.
term.index <- findInterval(as.numeric(dcps$date), as.numeric(term.starts))
term.index[term.index %in% 0L] <- NA
dcps$term <- as.numeric(term.index)

## Dates relative to start of term
## Whole days elapsed since the first day of the row's own term (NA when the
## row has no term). Date subtraction is always expressed in days.
dcps$t <- as.numeric(dcps$date - term.starts[term.index])
## Cumulative cases from start of term
# Attach each term's smallest reported student count to its rows, then drop
# the grouping so every later step operates on an ungrouped table.
dcps <- dcps %>%
  group_by(term) %>%
  mutate(
    starting.cases.students = min(cases.students, na.rm = TRUE)
  ) %>%
  ungroup()

# Rebase the term-3 and term-4 student counts by subtracting the largest
# count reported in the preceding term, so that each series counts from the
# start of its own term.
# Order matters: term 4 is rebased first, while the term-3 values it
# subtracts are still the original (not yet rebased) counts.
# which() turns the masks into row positions so rows with a missing term are
# skipped rather than breaking the subscripted assignment.
term2.rows <- which(dcps$term == 2)
term3.rows <- which(dcps$term == 3)
term4.rows <- which(dcps$term == 4)
dcps$cases.students[term4.rows] <- dcps$cases.students[term4.rows] -
  max(dcps$cases.students[term3.rows], na.rm = TRUE) # Subtract T3 final count
dcps$cases.students[term3.rows] <- dcps$cases.students[term3.rows] -
  max(dcps$cases.students[term2.rows], na.rm = TRUE) # Subtract T2 final count
# Plot cumulative cases in 2021-T1 vs 2020-T4
# Latest day-of-term observed so far in the current term (term 5). The
# previous term's series is truncated at the same day so both lines cover the
# same span. Deliberately not named `T`, which would mask the TRUE shorthand.
t.max <- max(dcps$t[which(dcps$term == 5)], na.rm = TRUE)
current.term <- dcps[which(dcps$term == 5 & dcps$t <= t.max), ]
previous.term <- dcps[which(dcps$term == 4 & dcps$t <= t.max), ]

# Text labels for the two series, placed 4 days to the right of the last
# point and 25 cases above each series' maximum.
labels <- data.frame(
  x = c(t.max + 4, t.max + 4),
  y = c(
    max(current.term$cases.students, na.rm = TRUE) + 25,
    max(previous.term$cases.students, na.rm = TRUE) + 25
  ),
  text = c("2021/22 Term 1", "2020/21 Term 4")
)

# Current term in solid red, previous term in dashed blue. The previous-term
# layers inherit the x/y mapping from ggplot() and only swap the data.
p <- ggplot(data = current.term, aes(x = t, y = cases.students)) +
  geom_point(color = "red") +
  geom_line(color = "red") +
  geom_point(data = previous.term, color = "blue") +
  geom_line(data = previous.term, linetype = "dashed", color = "blue") +
  ylab("Cumulative cases") +
  xlab("Days since start of term") +
  labs(
    title = "Cumulative Student Cases Since Start of Term",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  ) +
  theme_minimal() +
  # hjust is a fixed setting, not a data mapping, so it lives outside aes().
  geom_text(data = labels, aes(x = x, y = y, label = text), hjust = "inward")
p

# Make sure the output folder exists, then save this specific plot rather
# than whatever happens to be the most recently created one.
dir.create(here("figures"), showWarnings = FALSE, recursive = TRUE)
ggsave(here("figures", "now_vs_t4.png"), plot = p)
# Plot 2021-22 outcomes by themselves
# The four figures below share the same data subset, layout, caption and
# output size; only the plotted column, line style and labels differ.

## Rows belonging to the 2021 school year (which() skips rows without a year).
dcps.2021 <- dcps[which(dcps$year == 2021), ]
source.caption <- "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"

## Make sure the output folder exists before any figure is written.
dir.create(here("figures"), showWarnings = FALSE, recursive = TRUE)

## Build one date-vs-count figure: a coloured line with black points on top.
##   data       - table containing a `date` column and the column `y_col`
##   y_col      - name (string) of the column drawn on the y axis
##   line_color - colour of the line
##   line_type  - linetype of the line ("solid", "dashed", ...)
##   y_label    - y-axis label
##   title      - plot title
## Returns the ggplot object.
plot_series <- function(data, y_col, line_color, line_type, y_label, title) {
  ggplot(data, aes(x = date, y = .data[[y_col]])) +
    geom_line(color = line_color, linetype = line_type) +
    geom_point() +
    ylab(y_label) +
    theme_minimal() +
    labs(title = title, caption = source.caption)
}

## Write `plot` to figures/<file_name> at 6.5 x 4.5 inches. The plot is
## passed explicitly so the file never depends on which plot was built last.
save_figure <- function(plot, file_name) {
  ggsave(
    here("figures", file_name),
    plot = plot,
    width = 6.5,
    height = 4.5,
    units = "in"
  )
}

## T1 Student Cases
p.student.cases <- plot_series(
  dcps.2021,
  y_col = "cases.students",
  line_color = "red",
  line_type = "solid",
  y_label = "Cases",
  title = "DCPS Reported Student Cases"
)
p.student.cases
save_figure(p.student.cases, "cases.students.png")

## T1 Student Quarantines
p.student.quarantine <- plot_series(
  dcps.2021,
  y_col = "quarantine.students",
  line_color = "red",
  line_type = "dashed",
  y_label = "Students in quarantine",
  title = "DCPS Reported Students Currently in Quarantine"
)
p.student.quarantine
save_figure(p.student.quarantine, "quarantine.students.png")

## T1 Staff Cases (same 2021 rows as the student plots)
p.staff.cases <- plot_series(
  dcps.2021,
  y_col = "cases.staff",
  line_color = "green4",
  line_type = "solid",
  y_label = "Daily cases",
  title = "DCPS Reported Staff Cases"
)
p.staff.cases
save_figure(p.staff.cases, "cases.staff.png")

## T1 Staff Quarantines (same 2021 rows as the student plots)
p.staff.quarantine <- plot_series(
  dcps.2021,
  y_col = "quarantine.staff",
  line_color = "green4",
  line_type = "dashed",
  y_label = "Staff in quarantine",
  title = "DCPS Reported Staff Currently in Quarantine"
)
p.staff.quarantine
save_figure(p.staff.quarantine, "quarantine.staff.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment