# Created June 27, 2017 13:27
# Source: GitHub gist dggoldst/6de5345d19b2891408e200a450f6f317
# Note from the gist page: this file may contain hidden or bidirectional
# Unicode text that could be interpreted or compiled differently than it
# appears. To review, open it in an editor that reveals hidden Unicode
# characters.
# Packages used by this script: tidyverse for reading, wrangling and plotting,
# ggrepel for geom_text_repel(), scales for the comma label formatter.
library(tidyverse)
library(ggrepel)
library(scales)

# Switch to the project folder when it exists on this machine. An unguarded
# setwd() stops the whole script with "cannot change working directory" on any
# other computer; in that case keep the current directory so the script still
# runs when launched from the folder that holds the CSV.
project_dir <- "C:/Dropbox/Projects/20170605_Population_Density"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project directory not found: ", project_dir,
    "; using current working directory ", getwd()
  )
}
# Data source:
# https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/GCTPH1.US05PR
# The first line of the CSV is skipped; the following line is read as the
# header and is then replaced by the readable names assigned below.
df <- read_csv(file = "DEC_10_SF1_GCTPH1.US05PR.csv", skip = 1)

# Human-readable column headers, assigned positionally (14 columns expected).
names(df) <- c(
  # identifiers and labels
  "id", "state", "country",
  "geo_id", "geo_id_suffix", "geographic_area", "county_name",
  # counts
  "population", "housing_units",
  # areas
  "total_area", "water_area", "land_area",
  # densities
  "density_population_sqmi_land", "density_housing_units_sqmi_land"
)
# Drop Puerto Rico and DC (sorry guys!) by removing three specific geo_id
# values. Conditions separated by commas are AND-ed, and `!=` is kept on
# purpose: rows whose geo_id is NA are dropped as well, exactly as with
# three chained filter() calls.
df <- filter(
  df,
  geo_id != "0400000US72",
  geo_id != "0500000US11001",
  geo_id != "0400000US11"
)
# State lookup table with four columns (state, geographic_area, state_pop,
# state_density), built from the rows whose geo_id_suffix is shorter than five
# characters. It is joined onto the county rows later.
sdf <- df %>%
  filter(
    !is.na(geo_id_suffix),
    stringr::str_length(geo_id_suffix) < 5 # states have short geoids
  ) %>%
  mutate(
    # first two characters of the suffix are the join key shared with counties
    state = stringr::str_sub(geo_id_suffix, 1, 2),
    # keep the label from character 16 to the end (drops a fixed-width prefix)
    geographic_area = stringr::str_sub(geographic_area, start = 16)
  ) %>%
  # keep only the four needed columns, renaming the two measures on the way
  select(
    state,
    geographic_area,
    state_pop = population,
    state_density = density_population_sqmi_land
  )
# Reduce df to county rows only and discard the columns that are not used
# downstream.
df <- df %>%
  filter(
    !is.na(geo_id_suffix),
    stringr::str_length(geo_id_suffix) == 5 # counties have geoids of length 5
  ) %>%
  # the first two characters of the county suffix give the state join key
  mutate(state = stringr::str_sub(geo_id_suffix, 1, 2)) %>%
  select(
    -c(
      id, country, geo_id, housing_units, total_area,
      water_area, density_housing_units_sqmi_land
    )
  )
# Compute a population-weighted density per state and compare it with the
# plain state density.
#   1. Attach each county's state totals (state_pop, state_density).
#   2. Per state, sum county density weighted by the county's share of the
#      state population, rounded to a whole number.
#   3. Join the per-state figure back onto the state table so the state name,
#      population and unweighted density are available alongside it. sdf is on
#      the left, so only states present in sdf are kept.
# Both joins name their key explicitly so the result does not depend on which
# columns the two tables happen to share.
result <- df %>%
  left_join(sdf, by = "state") %>%
  group_by(state) %>%
  summarise(
    weighted_density = round(
      sum(population / state_pop * density_population_sqmi_land),
      0
    )
  ) %>%
  ungroup() %>%
  left_join(sdf, ., by = "state") %>%
  # highest weighted density first
  arrange(desc(weighted_density)) %>%
  # mark states with weighted density 10x higher than unweighted density
  mutate(highlight = weighted_density / state_density > 10)

# Save clean data for posterity.
write_csv(result, "result.csv")
# Scatterplot, Schulte style: unweighted state density (x) against
# population-weighted density (y), both on log10 axes, one labelled point per
# state. Points are coloured by `highlight`; the legend is hidden.

# Same tick positions on both axes.
scatter_breaks <- c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000)

p <- ggplot(
  result,
  aes(x = state_density, y = weighted_density, color = highlight)
) +
  theme_bw() +
  # `labels` is spelled out in full rather than relying on partial argument
  # matching of `label`
  scale_x_log10(breaks = scatter_breaks, labels = comma) +
  scale_y_log10(breaks = scatter_breaks, labels = comma) +
  geom_point() +
  # repel the state names so they do not overlap each other
  geom_text_repel(aes(label = geographic_area)) +
  # reference line where weighted density equals unweighted density
  geom_abline(slope = 1, intercept = 0) +
  theme(legend.position = "none") +
  labs(x = "Unweighted Population Density", y = "Weighted Population Density")
p
# ggsave(plot = p, filename = "unweighted_v_weighted_density.png", height = 8, width = 8)
# Long version of result: two rows per state, one for each density measure.
# `sortval` keeps a copy of the weighted density on every row, because the
# weighted_density column itself is folded into `measure` / `density`.
result_l <- result %>%
  mutate(sortval = weighted_density) %>%
  gather(key = measure, value = density, state_density, weighted_density) %>%
  # sort while `measure` is still character; it becomes a factor afterwards
  arrange(sortval, measure) %>%
  mutate(
    measure = factor(
      measure,
      levels = c("weighted_density", "state_density")
    )
  )
# Dot plot with one row per state, ordered by weighted density (sortval).
# Each state shows two points, its weighted and unweighted density, on a
# log10 x axis, joined by a black line.
p <- ggplot(
  result_l,
  aes(
    x = density,
    y = reorder(geographic_area, sortval),
    color = measure
  )
) +
  theme_bw() +
  geom_point(size = 3) +
  # connect the two measures for each state with a line
  geom_line(aes(group = geographic_area), color = "black") +
  # `labels` is spelled out in full rather than relying on partial argument
  # matching of `label`
  scale_x_log10(
    breaks = c(10, 30, 100, 300, 1000, 3000, 10000),
    labels = comma
  ) +
  theme(legend.position = "bottom") +
  labs(
    x = "Population density",
    y = "States ranked by weighted population density"
  ) +
  # untitled legend with readable names for the two measures
  scale_color_discrete(
    name = "",
    breaks = c("weighted_density", "state_density"),
    labels = c("Weighted Population Density", "Unweighted Population Density")
  )
p
# ggsave(plot = p, filename = "state_v_unweighted_and_weighted_density.png", height = 8, width = 6)
# GitHub page footer: Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.