Created
December 13, 2020 17:38
-
-
Save bmschmidt/818962befd5c592f4e4e1de502a8ae41 to your computer and use it in GitHub Desktop.
Why does the New York Times hate colleges?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "R Notebook" | |
output: html_notebook | |
--- | |
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. | |
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. | |
```{r} | |
library(tidycensus) | |
library(tidyverse) | |
library(sf) | |
options(tigris_use_cache = TRUE) | |
v18 <- load_variables(2018, "acs5", cache = TRUE) | |
v18 %>% filter(concept == "SCHOOL ENROLLMENT BY LEVEL OF SCHOOL FOR THE POPULATION 3 YEARS AND OVER") | |
# high school, college, grad school, total. | |
cats = c("B14001_007", "B14001_008", "B14001_009", "B01001_001") | |
g = get_acs("county", cats, geometry = TRUE, cache_table = TRUE) | |
# ?get_acs | |
shares = g %>% st_set_geometry(NULL) %>% select(-moe) %>% | |
pivot_wider(names_from = "variable", values_from="estimate") %>% | |
mutate(share = (B14001_008 + B14001_009) / B01001_001) %>% | |
mutate(students = B14001_008 + B14001_009, undergrads = B14001_008) %>% | |
select(NAME, FIPS = GEOID, share, students, undergrads, total = B01001_001) %>% | |
mutate(category = ifelse(share > .1, "student county", "non-student county") ) | |
#wider = g %>% | |
# distinct(GEOID) %>% | |
# inner_join(shares) %>% | |
# ggplot() + geom_sf(aes(fill=B14001_001)) | |
# wider + scale_fill_viridis_c(trans="log") | |
shares %>% | |
arrange(-total) %>% | |
mutate(r = 1:n(), cumulative = cumsum(total))%>% | |
ggplot() + geom_line(aes(x=r, y = cumulative)) | |
shares %>% write_csv("ACS.csv") | |
confirmed_raw = read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') | |
deaths_raw = read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv') | |
deparse = . %>% mutate(FIPS = str_pad(FIPS, 5, "left", "0")) %>% | |
pivot_longer(cols = matches("[0-9]/"), names_to = "date", values_to = "count") %>% | |
mutate(date = lubridate::parse_date_time(date, orders = "m/d/y")) %>% mutate(new = count - lag(count, 1)) | |
confirmed = confirmed_raw %>% deparse %>% mutate(variable = "confirmed") | |
deaths = deaths_raw %>% deparse %>% mutate(variable = "deaths") | |
together = confirmed %>% bind_rows(deaths) %>% inner_join(shares) | |
together %>% filter(variable=="deaths") %>% filter(new > 0) %>% | |
group_by(category, date, variable) %>% summarize(count = sum(new)/sum(total)) %>% | |
ggplot() + geom_line(aes(x = date, y = count, color=category)) + labs(title = "Death Rate from COVID-19 by student population") + scale_color_brewer(palette = 2, type="qual") + theme_bw() + facet_wrap(~variable, scales = "free_y") + labs(caption = "Johns Hopkins data") | |
weekly = together %>% filter(date > lubridate::ymd("2020-03-15")) %>% mutate(week = lubridate::round_date(date, unit = "week")) %>% | |
group_by(FIPS, variable, Combined_Key, week, students, share, category, total) %>% | |
summarize(new = sum(new)) | |
weekly %>% group_by(week, variable) %>% summarize(new = sum(new)) %>% | |
ggplot() + geom_line(aes(x = week, y = new, color = variable)) + | |
labs(title = "Weekly counts. Sanity test on data.") | |
``` | |
```{r} | |
populations = weekly %>% ungroup %>% distinct(FIPS, category, total) | |
pops = populations %>% count(category, wt = total, name = "total") | |
weekly %>% | |
group_by(category, week, variable) %>% | |
summarize(count = sum(new)) %>% | |
inner_join(pops) %>% | |
ggplot() + geom_line(aes(x = week, y = count/total, color = category)) + | |
facet_wrap(~variable, scales = "free_y") | |
``` | |
```{r} | |
overall_rates = weekly %>% | |
group_by(variable, total) %>% | |
summarize(total = sum(total)) | |
``` | |
```{r} | |
weekly %>% ungroup %>% | |
group_by(week, total, variable, FIPS, category) %>% | |
summarize(new = sum(new)) %>% | |
group_by(week, variable, category) %>% | |
summarize(total = sum(total), new = sum(new)) %>% | |
mutate(ratio = new/total) %>% | |
select(-new, -total) %>% | |
pivot_wider(names_from = "category", values_from = "ratio") %>% | |
ggplot() + geom_line(aes(x = week, y = `student county`/`non-student county`, color = variable)) + | |
theme_bw() + labs(title = "Student county rates as share of non-student-county rates.") | |
``` | |
```{r} | |
# Counties with low death rates before August 30. | |
library(lubridate) | |
weekly_ranks = weekly %>% ungroup %>% filter(week < ymd("2020-08-30")) %>% group_by(FIPS) %>% | |
filter(variable=="deaths") %>% summarize(deaths = sum(new)) %>% inner_join(populations) %>% | |
mutate(ratio = deaths/total) %>% arrange(ratio) %>% | |
mutate(death_rank_pre_september = 1:n()) %>% select(total, FIPS, category, death_rank_pre_september) | |
weekly | |
cat2 = weekly_ranks %>% | |
mutate(pre_sept_class = ifelse(death_rank_pre_september < 1500, "low", "high")) %>% | |
select(-death_rank_pre_september) | |
weekly %>% ungroup %>% | |
left_join(cat2) %>% | |
mutate(category = pre_sept_class) %>% | |
group_by(week, total, variable, FIPS, category) %>% | |
summarize(new = sum(new)) %>% | |
group_by(week, variable, category) %>% | |
summarize(total = sum(total), new = sum(new)) %>% | |
mutate(ratio = new/total) %>% | |
select(-new, -total) %>% | |
pivot_wider(names_from = "category", values_from = "ratio") %>% | |
ggplot() + geom_line(aes(x = week, y = `low`/`high`, color = variable)) + | |
theme_bw() + labs(title = "The 1500 counties with low death rates before August \ngenerally reverted back up over the mean ") + scale_y_continuous("rate relative to high-incidence counties", labels = scales::percent) | |
``` | |
```{r} | |
weekly %>% inner_join(tibble(week = lubridate::ymd("2020-08-02", "2020-12-06"))) | |
weekly %>% | |
pivot_wider(names_from = "week", values_from = "new") %>% | |
filter(variable == "confirmed") %>% | |
mutate(delta = `2020-12-06`/`2020-08-02`) %>% | |
filter(delta > 0, `2020-08-02` > 5, `2020-12-06` > 5) %>% | |
arrange(-total) %>% | |
ggplot() + aes(x = share, y = delta) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus cases, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("Student Population share", trans="sqrt", labels = scales::percent) + geom_smooth() + theme_bw() + labs(title = "Is there a relationship between student population and corona rate?", subtitle = "Not that I can see") | |
``` | |
```{r} | |
weekly %>% | |
pivot_wider(names_from = "week", values_from = "new") %>% | |
filter(variable == "deaths") %>% | |
mutate(delta = `2020-12-06`/`2020-08-30`) %>% | |
filter(delta > 0, total > 3000) %>% | |
arrange(-total) %>% | |
ggplot() + aes(x = share, y = delta) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus deaths, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("Student Population share", trans="sqrt", labels = scales::percent) + geom_smooth() + theme_bw() + labs(title = "Is there a relationship between student population and corona death rate?", subtitle = "Not that I can see") | |
``` | |
```{r} | |
weekly %>% | |
pivot_wider(names_from = "week", values_from = "new") %>% | |
filter(variable == "deaths") %>% | |
filter(Combined_Key %>% str_detect("Wisconsin")) %>% | |
mutate(delta = `2020-12-06`/`2020-08-30`) %>% | |
arrange(-total) %>% | |
ggplot() + aes(x = total, y = delta, color=category) + geom_point(alpha = 0.1) + geom_text(aes(label = Combined_Key %>% str_replace(", US", "")), check_overlap = TRUE) + scale_y_continuous("Change in Coronavirus deaths, Week of 8/02 to week of 12-06", trans="log10") + scale_x_continuous("pop", trans="log10", labels = scales::percent)+ theme_bw() + labs(title = "Is there a relationship between student population and corona death rate?", subtitle = "Not that I can see") | |
``` | |
``` | |
```{r} | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment