Last active
September 21, 2021 04:31
-
-
Save andrewzeitlin/9c4dfc47d02f6e1e86e8866b491bb7d7 to your computer and use it in GitHub Desktop.
DCPS covid cases
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape and visualize COVID case data from DCPS website.
library(here)       # here(): project-relative paths for the saved figures
library(rvest)      # read_html(), html_nodes(), html_text()
library(dplyr)      # rename(), group_by(), mutate()
library(stringr)    # str_detect(), fixed()
library(lubridate)  # mdy()
library(ggplot2)    # plotting and ggsave()
# NOTE(review): no directlabels function is called in this script; confirm
# whether this dependency is still needed.
library(directlabels)
url <- "https://coronavirus.dc.gov/node/1506966"

# Download and parse the page once, so the date headings and the statistics
# are read from the same snapshot (and the site is only requested once).
page <- read_html(url)

# List of dates: text of every <strong> element, minus headings mentioning DCPS
dates <- page %>%
  html_nodes("strong") %>%
  html_text()
dates <- dates[!str_detect(dates, fixed("DCPS", ignore_case = TRUE))]

# Statistics re those dates: text of every list item inside the page node
stats <- page %>%
  html_nodes("#node-page-1506966 li") %>%
  html_text()

# Case-insensitive keyword flags, computed once and combined below.
is.staff      <- str_detect(stats, fixed("personnel", ignore_case = TRUE))
is.student    <- str_detect(stats, fixed("students", ignore_case = TRUE))
is.positive   <- str_detect(stats, fixed("positive", ignore_case = TRUE))
is.pending    <- str_detect(stats, fixed("pending", ignore_case = TRUE))
is.quarantine <- str_detect(stats, fixed("quarantine", ignore_case = TRUE))

# Positive-case items exclude any item that also mentions "pending".
stats.cases.staff         <- stats[is.staff & is.positive & !is.pending]
stats.quarantine.staff    <- stats[is.staff & is.quarantine]
stats.cases.students      <- stats[is.student & is.positive & !is.pending]
stats.quarantine.students <- stats[is.student & is.quarantine]
# Confirm that resulting data are rectangular.
# Every scraped vector must have one entry per reporting date. data.frame()
# recycles a shorter vector when its length divides the longest one, so a
# mismatch could otherwise misalign dates and counts without any error.
lengths <- c(
  dates = length(dates),
  stats.cases.staff = length(stats.cases.staff),
  stats.quarantine.staff = length(stats.quarantine.staff),
  stats.cases.students = length(stats.cases.students),
  stats.quarantine.students = length(stats.quarantine.students)
)
print(lengths)
if (length(unique(lengths)) != 1L) {
  stop(
    "Scraped vectors have differing lengths (",
    paste(names(lengths), lengths, sep = " = ", collapse = ", "),
    "); the page layout may have changed.",
    call. = FALSE
  )
}
# Assemble one row per reporting date. Columns are named at construction,
# in the same left-to-right order the scraped vectors were combined before.
dcps <- data.frame(
  date = dates,
  cases.staff = stats.cases.staff,
  quarantine.staff = stats.quarantine.staff,
  cases.students = stats.cases.students,
  quarantine.students = stats.quarantine.students
)
# Clean variables
# Parse the scraped heading text as month-day-year dates.
dcps$date <- mdy(dcps$date)

# Reduce each statistic's text to the characters 0-9, "." and "-", then
# convert to numeric.
# NOTE(review): because "." and "-" are kept, text containing e.g. a hyphenated
# number alongside the count would not convert cleanly (NA) -- confirm against
# the page wording.
count.columns <- c(
  "cases.staff",
  "cases.students",
  "quarantine.staff",
  "quarantine.students"
)
dcps[count.columns] <- lapply(
  dcps[count.columns],
  function(txt) as.numeric(gsub("[^0-9.-]", "", txt))
)
# Derived variables

## Dates of term
## First day of each term, listed in chronological order.
start.20.t1 <- mdy("August 31, 2020")   # 2020-21 Term 1
start.20.t2 <- mdy("November 9, 2020")  # 2020-21 Term 2
start.20.t3 <- mdy("February 1, 2021")  # 2020-21 Term 3
start.20.t4 <- mdy("April 19, 2021")    # 2020-21 Term 4
start.21.t1 <- mdy("August 30, 2021")   # 2021-22 Term 1
## School year
## 2021 for dates on or after the start of 2021-22 Term 1, otherwise 2020
## (NA when the date itself is NA).
dcps$year <- ifelse(dcps$date >= start.21.t1, 2021, 2020)

## School term
## Terms are numbered 1-4 for 2020-21 and 5 for 2021-22 Term 1. Each term runs
## from its own start date (inclusive) to the next term's start (exclusive).
## A single lookup replaces the per-term logical masks: those masks become NA
## for rows whose date is missing or precedes the first term, and an NA mask
## cannot be used in a subscripted assignment with a multi-element value.
term.starts <- c(start.20.t1, start.20.t2, start.20.t3, start.20.t4, start.21.t1)
term.index <- findInterval(as.numeric(dcps$date), as.numeric(term.starts))
term.index[term.index == 0L] <- NA  # before the first term start: no term
dcps$term <- as.numeric(term.index)

## Dates relative to start of term
## Number of days between each date and the start of the term it falls in.
dcps$t <- as.numeric(dcps$date - term.starts[term.index])
## Cumulative cases from start of term
## Record the smallest student case count seen within each term, then drop the
## grouping so later steps operate on an ordinary (ungrouped) table.
## NOTE(review): starting.cases.students is not referenced again in this script.
dcps <- dcps %>%
  group_by(term) %>%
  mutate(starting.cases.students = min(cases.students, na.rm = TRUE)) %>%
  ungroup()

# Restart the student case series at the beginning of terms 4 and 3 by
# subtracting the largest value reported in the preceding term.
# NOTE(review): this presumes the published count keeps accumulating across
# terms 2-4 -- confirm against the source page.
# which() keeps rows with a missing term out of the index, and na.rm = TRUE
# keeps a single missing count from turning the whole offset into NA.
# Order matters: term 4 must be adjusted first, while term 3 still holds its
# unadjusted values.
rows.t2 <- which(dcps$term == 2)
rows.t3 <- which(dcps$term == 3)
rows.t4 <- which(dcps$term == 4)
dcps$cases.students[rows.t4] <- dcps$cases.students[rows.t4] -
  max(dcps$cases.students[rows.t3], na.rm = TRUE)  # subtract T3 final count
dcps$cases.students[rows.t3] <- dcps$cases.students[rows.t3] -
  max(dcps$cases.students[rows.t2], na.rm = TRUE)  # subtract T2 final count
# Plot cumulative cases in 2021-T1 vs 2020-T4

# Latest day-of-term observed so far in 2021-22 Term 1; both series are cut
# off at this day so they cover the same span. Deliberately not named `T`:
# that would overwrite the built-in alias for TRUE (and equals 0 on the first
# day of term).
t.max <- max(dcps$t[which(dcps$term == 5)])

# Rows to draw for each series (which() drops rows with a missing term or t).
current.term <- dcps[which(dcps$term == 5 & dcps$t <= t.max), ]
previous.term <- dcps[which(dcps$term == 4 & dcps$t <= t.max), ]

# Text labels placed 4 days right of the last point and 25 cases above each
# series' maximum.
labels <- data.frame(
  x = c(t.max + 4, t.max + 4),
  y = c(
    max(current.term$cases.students) + 25,
    max(previous.term$cases.students) + 25
  ),
  text = c('2021/22 Term 1', '2020/21 Term 4')
)

p <- ggplot(data = current.term, aes(x = t, y = cases.students)) +
  geom_point(color = 'red') +
  geom_line(color = 'red') +
  geom_point(
    data = previous.term,
    aes(x = t, y = cases.students),
    color = 'blue'
  ) +
  geom_line(
    data = previous.term,
    aes(x = t, y = cases.students),
    linetype = 'dashed',
    color = 'blue'
  ) +
  ylab('Cumulative cases') +
  xlab('Days since start of term') +
  labs(
    title = "Cumulative Student Cases Since Start of Term",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  ) +
  theme_minimal() +
  # hjust is a fixed setting, so it is passed outside aes().
  geom_text(data = labels, aes(x = x, y = y, label = text), hjust = "inward")
p

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(here('figures', 'now_vs_t4.png'), plot = p)
# Plot 2021-22 outcomes by themselves

## T1 Student Cases
## Student case count by report date, for rows with year == 2021.
p.student.cases <- ggplot(
  dcps[dcps$year == 2021, ],
  aes(x = date, y = cases.students)
) +
  geom_line(color = 'red') +
  geom_point() +
  ylab("Cases") +
  theme_minimal() +
  labs(
    title = "DCPS Reported Student Cases",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  )
p.student.cases

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(
  here('figures', 'cases.students.png'),
  plot = p.student.cases,
  width = 6.5,
  height = 4.5,
  units = "in"
)
## T1 Student Quarantines
## Number of students in quarantine by report date, for rows with year == 2021.
p.student.quarantine <- ggplot(
  dcps[dcps$year == 2021, ],
  aes(x = date, y = quarantine.students)
) +
  geom_line(color = 'red', linetype = 'dashed') +
  geom_point() +
  ylab("Students in quarantine") +
  theme_minimal() +
  labs(
    title = "DCPS Reported Students Currently in Quarantine",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  )
p.student.quarantine

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(
  here('figures', 'quarantine.students.png'),
  plot = p.student.quarantine,
  width = 6.5,
  height = 4.5,
  units = "in"
)
## T1 Staff Cases
## Staff case count by report date. The filter is year == 2021, i.e. dates on
## or after the start of 2021-22 Term 1 (the same rows as the student charts).
## NOTE(review): the y-axis reads "Daily cases" while the student chart reads
## "Cases" -- confirm which description matches the published figures.
p.staff.cases <- ggplot(
  dcps[dcps$year == 2021, ],
  aes(x = date, y = cases.staff)
) +
  geom_line(color = 'green4') +
  geom_point() +
  ylab("Daily cases") +
  theme_minimal() +
  labs(
    title = "DCPS Reported Staff Cases",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  )
p.staff.cases

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(
  here('figures', 'cases.staff.png'),
  plot = p.staff.cases,
  width = 6.5,
  height = 4.5,
  units = "in"
)
## T1 Staff Quarantines
## Number of staff in quarantine by report date. The filter is year == 2021,
## i.e. dates on or after the start of 2021-22 Term 1.
p.staff.quarantine <- ggplot(
  dcps[dcps$year == 2021, ],
  aes(x = date, y = quarantine.staff)
) +
  geom_line(color = 'green4', linetype = 'dashed') +
  geom_point() +
  ylab("Staff in quarantine") +
  theme_minimal() +
  labs(
    title = "DCPS Reported Staff Currently in Quarantine",
    caption = "Source: https://coronavirus.dc.gov/page/dc-public-schools-dcps-data"
  )
p.staff.quarantine

# Name the plot explicitly instead of relying on ggsave()'s last-plot default.
ggsave(
  here('figures', 'quarantine.staff.png'),
  plot = p.staff.quarantine,
  width = 6.5,
  height = 4.5,
  units = "in"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment