Created
August 24, 2015 02:17
-
-
Save jepusto/2825b7e2a2b49ff2e61a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyr) | |
library(dplyr) | |
library(stringr) | |
library(ggplot2) | |
cities_select <- c("HOUSTON","SAN ANTONIO","DALLAS","AUSTIN","FORT WORTH","EL PASO") | |
#---------------------------------------- | |
# get population estimates | |
#---------------------------------------- | |
# get 2000-2010 | |
pop_ests_2000 <- read.csv("http://www.census.gov/popest/data/intercensal/cities/files/SUB-EST00INT.csv", | |
stringsAsFactors = FALSE) | |
filter(pop_ests_2000, STNAME=="Texas" & SUMLEV==162 & toupper(NAME) %in% paste(cities_select, "CITY")) %>% | |
select(NAME, POPESTIMATE2000:POPESTIMATE2009, POPESTIMATE2010) %>% | |
gather("Year","Pop", POPESTIMATE2000:POPESTIMATE2010) %>% | |
mutate(City = toupper(str_sub(NAME, end = -6L)), | |
Year = as.integer(str_sub(Year, start = -4L)), | |
source = "2000-2010") %>% | |
select(-NAME) -> | |
pop_by_city_2000 | |
# get 2010-2014 | |
pop_ests_2010 <- read.csv("http://www.census.gov/popest/data/cities/totals/2014/files/SUB-EST2014_48.csv", | |
stringsAsFactors = FALSE) | |
filter(pop_ests_2010, SUMLEV==162 & toupper(NAME) %in% paste(cities_select, "CITY")) %>% | |
select(NAME, POPESTIMATE2010:POPESTIMATE2014) %>% | |
gather("Year","Pop", POPESTIMATE2010:POPESTIMATE2014) %>% | |
mutate(City = toupper(str_sub(NAME, end = -6L)), | |
Year = as.integer(str_sub(Year, start = -4L)), | |
source = "2010-2014") %>% | |
select(-NAME) -> | |
pop_by_city_2010 | |
# adjust so 2010 agrees across sources | |
filter(pop_by_city_2000, Year==2010) %>% | |
select(City, Pop_old = Pop) %>% | |
left_join(select(filter(pop_by_city_2010, Year==2010), City, Pop), by = "City") %>% | |
mutate(ratio = Pop / Pop_old) %>% | |
select(City, ratio) -> | |
pop_adj_ratio | |
left_join(pop_by_city_2000, pop_adj_ratio, by = "City") %>% | |
mutate(Pop = Pop * ratio * max((Year - 2005) / 5, 0)) %>% | |
filter(2006 <= Year, Year <= 2009) %>% | |
select(-ratio) -> | |
pop_by_city_2000_adj | |
pop_by_city <- rbind(pop_by_city_2000_adj, pop_by_city_2010) | |
# add projection for 2015 | |
lm_pop <- lm(Pop ~ 0 + City + Year:City, data = pop_by_city_2010) | |
pop2015 <- filter(pop_by_city, Year==2014) %>% mutate(Year = 2015) | |
pop2015$Pop <- predict(lm_pop, newdata = pop2015) | |
pop_by_city <- rbind(pop_by_city, pop2015) | |
ggplot(pop_by_city, aes(Year, Pop, color = City, shape = City, linetype = source)) + | |
geom_point() + geom_line() + | |
theme_minimal() | |
#--------------------------------------- | |
# prepare crash data | |
#--------------------------------------- | |
crash_dat <- read.csv("http://blogs.edb.utexas.edu/pusto/files/2015/08/Yearly_crash_data_by_city.csv", | |
stringsAsFactors=FALSE) | |
select(crash_dat, City, Year, Fatal_crashes:Incapacitating_injuries, Total_crashes) %>% | |
filter(City %in% cities_select) %>% droplevels() %>% | |
mutate(Fatality_rate = Fatal_crashes / Total_crashes) %>% | |
gather("quantity","n", Fatal_crashes:Total_crashes, Fatality_rate) %>% | |
filter(!is.na(n)) %>% | |
left_join(pop_by_city, by = c("City","Year")) %>% | |
mutate(partial = (Year==2015), | |
quantity = str_replace(quantity,"_"," "), | |
n_per = n / Pop * 100000, | |
projection = "actual") -> | |
crashes | |
#------------------------------- | |
# Crashes per 100K residents | |
#------------------------------- | |
filter(crashes, Year==2015) %>% mutate(n = 0, n_per = 0) -> | |
crashes_axis | |
filter(crashes, Year==2015) %>% | |
mutate(n = n * 12 / 7, n_per = n_per * 12/ 7, projection = "projected") -> | |
crashes_projected | |
crashes <- rbind(crashes, crashes_projected) | |
filter(crashes, quantity != "Fatality rate", | |
Year < 2015 | projection == "projected") %>% droplevels() -> | |
crashes_total | |
crash_labels <- filter(crashes_total, Year==2015) | |
crashes_per <- | |
ggplot(crashes_total, aes(Year, n_per, color = City, shape = City)) + | |
geom_line() + geom_point() + | |
geom_text(data = crash_labels, aes(x = 2015, y = n_per, label = City), hjust=-0.1, size = 3) + | |
geom_blank(data = filter(crashes_axis, quantity!="Fatality rate")) + | |
scale_color_brewer(type = "qual", palette=7) + | |
scale_x_continuous(limits = c(2006, 2016), | |
breaks = 2006:2015, labels = c(2006:2014, "2015 \n (Projected)")) + | |
facet_wrap(~ quantity, ncol = 1, scales = "free") + | |
theme_minimal() + theme(legend.position="none") + | |
labs(color = "", shape = "", y = "Crashes/Fatalities per 100,000 residents", | |
title = "Annual crashes and fatalities/injuries per 100,000 residents \n in selected Texas cities (2006-2015)") | |
arrange(crashes_total, quantity, City, Year) %>% | |
group_by(quantity, City) %>% | |
mutate(change = n_per - lag(n_per)) %>% | |
filter(!is.na(change) & quantity %in% c("Fatal crashes","Fatalities")) -> | |
crashes_change | |
summary(filter(crashes_change, quantity=="Fatal crashes")$change) | |
summary(filter(crashes_change, quantity=="Fatalities")$change) | |
ggplot(crashes_change, aes(change, color = quantity)) + geom_density() | |
#------------------------------- | |
# Total crashes | |
#------------------------------- | |
crashes_total_fig <- | |
ggplot(crashes_total, aes(Year, n, color = City, shape = City)) + | |
geom_line() + geom_point() + | |
geom_text(data = crash_labels, aes(x = 2015, y = n, label = City), hjust=-0.1, size = 3) + | |
geom_blank(data = filter(crashes_axis, quantity!="Fatality rate")) + | |
scale_color_brewer(type = "qual", palette=7) + | |
scale_x_continuous(limits = c(2006, 2016), | |
breaks = 2006:2015, labels = c(2006:2014, "2015 \n (Projected)")) + | |
facet_wrap(~ quantity, ncol = 1, scales = "free") + | |
theme_minimal() + theme(legend.position="none") + | |
labs(color = "", shape = "", y = "Total Crashes/Fatalities", | |
title = "Annual total crashes and fatalities/injuries \n in selected Texas cities (2006-2015)") | |
#------------------------------- | |
# Fatality rate figure | |
#------------------------------- | |
fatality_rate <- filter(crashes, quantity=="Fatality rate" & projection=="actual") | |
crash_labels <- filter(fatality_rate, Year==2015) | |
fatality_rates <- | |
ggplot(fatality_rate, aes(Year, n, color = City, shape = City)) + | |
geom_line() + geom_point() + | |
geom_blank(data = filter(crashes_axis, quantity=="Fatality rate")) + | |
geom_text(data = crash_labels, aes(x = 2015, y = n, label = City), hjust=-0.1, size = 3) + | |
scale_color_brewer(type = "qual", palette=7) + | |
scale_x_continuous(limits = c(2006, 2016), breaks = 2006:2015, | |
labels = c(2006:2014, "2015 \n (Through 7/31)")) + | |
theme_minimal() + theme(legend.position="none") + | |
labs(color = "", shape = "", y = "Fatal crashes / Total crashes", | |
title = "Proportion of crashes that are fatal \n in selected Texas cities (2010-2015)") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment