Skip to content

Instantly share code, notes, and snippets.

@dggoldst
Created June 27, 2017 13:27
Show Gist options
  • Save dggoldst/6de5345d19b2891408e200a450f6f317 to your computer and use it in GitHub Desktop.
Save dggoldst/6de5345d19b2891408e200a450f6f317 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(ggrepel)
library(scales)
# NOTE(review): the working directory is hard-coded to an absolute path on the
# author's machine; every relative path used below (the input CSV and the
# result file) resolves against it.
setwd("C:/Dropbox/Projects/20170605_Population_Density")

# Data source:
# https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/GCTPH1.US05PR
# skip = 1 discards the first line of the file, so the second line is read as
# the header (presumably the first line holds machine-readable column ids --
# verify against the file).
df <- read_csv(file = "DEC_10_SF1_GCTPH1.US05PR.csv", skip = 1)
# Replace the column headers read from the file with short, human-readable
# names. The names are assigned positionally, so the vector lists one name per
# column, in file order.
names(df) <- c(
  # identifiers
  "id", "state", "country", "geo_id", "geo_id_suffix",
  # labels
  "geographic_area", "county_name",
  # counts
  "population", "housing_units",
  # areas
  "total_area", "water_area", "land_area",
  # densities per square mile of land
  "density_population_sqmi_land", "density_housing_units_sqmi_land"
)
# Drop Puerto Rico and DC. Sorry guys!
# Three rows are excluded by geo_id (presumably 72 = Puerto Rico and 11 = DC,
# the latter appearing under two different geo_id prefixes -- verify against
# the file). All conditions must hold for a row to be kept.
df <- df %>%
  filter(
    geo_id != "0400000US72",
    geo_id != "0500000US11001",
    geo_id != "0400000US11"
  )
# Build the state-level lookup table used for joining later. It carries four
# facts per state: two-character state code, state name, state population and
# state population density.
sdf <- df %>%
  # Keep rows that have a geo_id_suffix shorter than five characters; these
  # are treated as the state rows (county suffixes have length five).
  filter(
    !is.na(geo_id_suffix),
    stringr::str_length(geo_id_suffix) < 5
  ) %>%
  mutate(
    # first two characters of the suffix are the state code
    state = stringr::str_sub(geo_id_suffix, 1, 2),
    # keep the label from character 16 to its end, dropping a fixed-width
    # prefix (presumably "United States - "; verify against the file)
    geographic_area = stringr::str_sub(geographic_area, 16)
  ) %>%
  # select and rename in one step
  select(
    state,
    geographic_area,
    state_pop = population,
    state_density = density_population_sqmi_land
  )
# Reduce df to county rows only and discard the columns that are not needed
# for the weighted-density calculation.
df <- df %>%
  # county rows are the ones whose geo_id_suffix is exactly five characters
  filter(
    !is.na(geo_id_suffix),
    stringr::str_length(geo_id_suffix) == 5
  ) %>%
  # the first two characters of the county suffix give the state code, which
  # is the join key against the state table
  mutate(state = stringr::str_sub(geo_id_suffix, 1, 2)) %>%
  # drop unneeded columns
  select(-c(
    id, country, geo_id, housing_units, total_area,
    water_area, density_housing_units_sqmi_land
  ))
# Compute the population-weighted density of every state.
# Each county row is joined to its state's totals; a county contributes its
# own density multiplied by its share of the state population
# (population / state_pop). Summing those contributions within a state gives
# the weighted density, which is rounded to a whole number.
result <- df %>%
  left_join(sdf, by = "state") %>%
  group_by(state) %>%
  summarise(
    weighted_density = round(
      sum(population / state_pop * density_population_sqmi_land),
      0
    )
  ) %>%
  ungroup() %>%
  # Attach the weighted figure to the state table. The join key is spelled
  # out instead of relying on an implicit natural join over shared column
  # names.
  left_join(sdf, ., by = "state") %>%
  # highest weighted density first
  arrange(-weighted_density) %>%
  # mark states with weighted density 10x higher than unweighted density
  mutate(highlight = weighted_density / state_density > 10)

# Save clean data for posterity
write_csv(result, "result.csv")
# Make the scatterplot, Schulte style: one point per state, unweighted density
# on x and weighted density on y, both on log10 axes. Points are colored by
# the highlight flag and labeled with the state name.
p <- ggplot(
  result,
  aes(x = state_density, y = weighted_density, color = highlight)
) +
  theme_bw() +
  # `labels` is written out in full; the abbreviated `label` only worked
  # through partial argument matching.
  scale_x_log10(
    breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
    labels = comma
  ) +
  scale_y_log10(
    breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
    labels = comma
  ) +
  geom_point() +
  # repelled labels so state names do not overlap
  geom_text_repel(aes(label = geographic_area)) +
  # reference line with slope 1: where weighted and unweighted density agree
  geom_abline(slope = 1) +
  theme(legend.position = "none") +
  labs(
    x = "Unweighted Population Density",
    y = "Weighted Population Density"
  )
p
# ggsave(plot = p, filename = "unweighted_v_weighted_density.png", height = 8, width = 8)
# Reshape result to long form: two rows per state, one for each density
# measure. The weighted density is first copied into sortval so that every
# row keeps the value used to rank the states.
result_l <- result %>%
  mutate(sortval = weighted_density) %>%
  # stack the two density columns into measure (column name) / density (value)
  gather(
    key = measure,
    value = density,
    state_density, weighted_density
  ) %>%
  arrange(sortval, measure) %>%
  # fix the factor level order of the two measures
  mutate(
    measure = factor(
      measure,
      levels = c("weighted_density", "state_density")
    )
  )
# Make the plot in which the rows are states sorted by weighted density.
# Each state gets two points (one per measure) on a log10 density axis.
p <- ggplot(
  result_l,
  aes(
    x = density,
    y = reorder(geographic_area, sortval),
    color = measure
  )
) +
  theme_bw() +
  geom_point(size = 3) +
  # connect the two measures for each state with a line
  geom_line(aes(group = geographic_area), color = "black") +
  # `labels` is written out in full; the abbreviated `label` only worked
  # through partial argument matching.
  scale_x_log10(
    breaks = c(10, 30, 100, 300, 1000, 3000, 10000),
    labels = comma
  ) +
  theme(legend.position = "bottom") +
  labs(
    x = "Population density",
    y = "States ranked by weighted population density"
  ) +
  # untitled legend with readable names for the two measures
  scale_color_discrete(
    name = "",
    breaks = c("weighted_density", "state_density"),
    labels = c("Weighted Population Density", "Unweighted Population Density")
  )
p
# ggsave(plot = p, filename = "state_v_unweighted_and_weighted_density.png", height = 8, width = 6)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment