dggoldst · June 27, 2017 13:27
diff --git a/weighted_population_density.R b/weighted_population_density.R
 library(tidyverse)
 library(ggrepel)
 library(scales)

 # Use the author's project folder as the working directory when it exists.
 # On any other machine stay in the current directory, so the script can run
 # from wherever the census CSV has been placed instead of aborting in setwd().
 project_dir <- "C:/Dropbox/Projects/20170605_Population_Density"
 if (dir.exists(project_dir)) {
   setwd(project_dir)
 } else {
   message("Project folder not found; staying in ", getwd())
 }

 # Source: https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/GCTPH1.US05PR
 # skip = 1 discards the first line of the file, so the second line is read as
 # the header row (presumably the first line holds machine codes -- TODO confirm).
 df <- read_csv("DEC_10_SF1_GCTPH1.US05PR.csv", skip = 1)

 # Replace the headers with short, human-readable names. The 14 names are
 # assigned by position, so their order must match the column order of the file.
 names(df) <- c(
   "id",
   "state",
   "country",
   "geo_id",
   "geo_id_suffix",
   "geographic_area",
   "county_name",
   "population",
   "housing_units",
   "total_area",
   "water_area",
   "land_area",
   "density_population_sqmi_land",
   "density_housing_units_sqmi_land"
 )
 # Exclude Puerto Rico and DC (sorry, guys!) by removing the rows carrying the
 # three geo_id values below. filter() keeps only rows where every condition is
 # TRUE, so rows with a missing geo_id are removed as well.
 df <- df %>%
   filter(
     geo_id != "0400000US72",
     geo_id != "0500000US11001",
     geo_id != "0400000US11"
   )

 # Build the state-level lookup table used by the later joins. It has four
 # columns: state (2-character code), geographic_area (name), state_pop and
 # state_density.
 sdf <- df %>%
   # state rows are those whose geo_id_suffix is present and shorter than 5
   filter(
     !is.na(geo_id_suffix),
     stringr::str_length(geo_id_suffix) < 5
   ) %>%
   mutate(
     state = stringr::str_sub(geo_id_suffix, 1, 2),
     # keep everything from character 16 onward (presumably this strips a fixed
     # prefix in front of the state name -- TODO confirm against the file)
     geographic_area = stringr::str_sub(geographic_area, 16)
   ) %>%
   # keep the four facts, renaming the two measures to their state_* names
   select(
     state,
     geographic_area,
     state_pop = population,
     state_density = density_population_sqmi_land
   )

 # Reduce df to county-level rows and discard the columns not used later.
 df <- df %>%
   filter(
     !is.na(geo_id_suffix),
     stringr::str_length(geo_id_suffix) == 5 # county geoids have 5 characters
   ) %>%
   # the first two characters of the geoid identify the state (join key to sdf)
   mutate(state = stringr::str_sub(geo_id_suffix, 1, 2)) %>%
   select(-c(
     id, country, geo_id, housing_units, total_area,
     water_area, density_housing_units_sqmi_land
   ))

 # Compute each state's population-weighted density: every county's density is
 # weighted by that county's share of the state population, the weighted values
 # are summed per state, and the sum is rounded to a whole number.
 result <- left_join(df, sdf, by = "state") %>%
   group_by(state) %>%
   summarise(weighted_density = round(sum(
     population / state_pop * density_population_sqmi_land
   ), 0)) %>%
   ungroup() %>%
   # attach the weighted figure to the state table; the key is spelled out so
   # the join matches the first one and does not depend on which columns the
   # two tables happen to share
   left_join(sdf, ., by = "state") %>%
   arrange(desc(weighted_density)) %>%
   # mark states whose weighted density is more than 10x the unweighted density
   mutate(highlight = weighted_density / state_density > 10)

 # save clean data for posterity
 write_csv(result, "result.csv")

 # Scatterplot, Schulte style: unweighted (x) against weighted (y) density for
 # each state on log10 axes; point and label colour follow the highlight flag.
 p <- ggplot(
   result,
   aes(x = state_density, y = weighted_density, color = highlight)
 ) +
   theme_bw() +
   # `labels` is written in full rather than relying on partial matching of
   # the abbreviated `label`
   scale_x_log10(
     breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
     labels = comma
   ) +
   scale_y_log10(
     breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
     labels = comma
   ) +
   geom_point() +
   geom_text_repel(aes(label = geographic_area)) +
   # slope-1 reference line
   geom_abline(slope = 1) +
   theme(legend.position = "none") +
   labs(x = "Unweighted Population Density", y = "Weighted Population Density")
 p
 #ggsave(plot = p, filename = "unweighted_v_weighted_density.png", height = 8, width = 8)

 # Reshape result to long form: two rows per state, one for each density
 # measure, with the measure name in `measure` and its value in `density`.
 result_l <- result %>%
   # copy of the weighted density that stays a column of its own after the
   # reshape, used for sorting
   mutate(sortval = weighted_density) %>%
   gather(key = measure, value = density, state_density:weighted_density) %>%
   arrange(sortval, measure) %>%
   # fix the factor level order: weighted first, then unweighted
   mutate(
     measure = factor(measure, levels = c("weighted_density", "state_density"))
   )

 # Dot plot with one row per state, rows ordered by weighted density (sortval);
 # each row shows both density measures on a log10 x axis.
 p <- ggplot(
   result_l,
   aes(x = density, y = reorder(geographic_area, sortval), color = measure)
 ) +
   theme_bw() +
   geom_point(size = 3) +
   # connect the two measures for each state with a line
   geom_line(aes(group = geographic_area), color = "black") +
   # `labels` is written in full rather than relying on partial matching of
   # the abbreviated `label`
   scale_x_log10(
     breaks = c(10, 30, 100, 300, 1000, 3000, 10000),
     labels = comma
   ) +
   theme(legend.position = "bottom") +
   labs(x = "Population density", y = "States ranked by weighted population density") +
   scale_color_discrete(
     name = "",
     breaks = c("weighted_density", "state_density"),
     labels = c("Weighted Population Density", "Unweighted Population Density")
   )
 p
 #ggsave(plot = p, filename = "state_v_unweighted_and_weighted_density.png", height = 8, width = 6)
	library(tidyverse)
	library(ggrepel)
	library(scales)

	# Use the author's project folder as the working directory when it exists.
	# On any other machine stay in the current directory, so the script can run
	# from wherever the census CSV has been placed instead of aborting in setwd().
	project_dir <- "C:/Dropbox/Projects/20170605_Population_Density"
	if (dir.exists(project_dir)) {
	  setwd(project_dir)
	} else {
	  message("Project folder not found; staying in ", getwd())
	}

	# Source: https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/GCTPH1.US05PR
	# skip = 1 discards the first line of the file, so the second line is read as
	# the header row (presumably the first line holds machine codes -- TODO confirm).
	df <- read_csv("DEC_10_SF1_GCTPH1.US05PR.csv", skip = 1)

	# Replace the headers with short, human-readable names. The 14 names are
	# assigned by position, so their order must match the column order of the file.
	names(df) <- c(
	  "id",
	  "state",
	  "country",
	  "geo_id",
	  "geo_id_suffix",
	  "geographic_area",
	  "county_name",
	  "population",
	  "housing_units",
	  "total_area",
	  "water_area",
	  "land_area",
	  "density_population_sqmi_land",
	  "density_housing_units_sqmi_land"
	)
	# Exclude Puerto Rico and DC (sorry, guys!) by removing the rows carrying the
	# three geo_id values below. filter() keeps only rows where every condition is
	# TRUE, so rows with a missing geo_id are removed as well.
	df <- df %>%
	  filter(
	    geo_id != "0400000US72",
	    geo_id != "0500000US11001",
	    geo_id != "0400000US11"
	  )

	# Build the state-level lookup table used by the later joins. It has four
	# columns: state (2-character code), geographic_area (name), state_pop and
	# state_density.
	sdf <- df %>%
	  # state rows are those whose geo_id_suffix is present and shorter than 5
	  filter(
	    !is.na(geo_id_suffix),
	    stringr::str_length(geo_id_suffix) < 5
	  ) %>%
	  mutate(
	    state = stringr::str_sub(geo_id_suffix, 1, 2),
	    # keep everything from character 16 onward (presumably this strips a fixed
	    # prefix in front of the state name -- TODO confirm against the file)
	    geographic_area = stringr::str_sub(geographic_area, 16)
	  ) %>%
	  # keep the four facts, renaming the two measures to their state_* names
	  select(
	    state,
	    geographic_area,
	    state_pop = population,
	    state_density = density_population_sqmi_land
	  )

	# Reduce df to county-level rows and discard the columns not used later.
	df <- df %>%
	  filter(
	    !is.na(geo_id_suffix),
	    stringr::str_length(geo_id_suffix) == 5 # county geoids have 5 characters
	  ) %>%
	  # the first two characters of the geoid identify the state (join key to sdf)
	  mutate(state = stringr::str_sub(geo_id_suffix, 1, 2)) %>%
	  select(-c(
	    id, country, geo_id, housing_units, total_area,
	    water_area, density_housing_units_sqmi_land
	  ))

	# Compute each state's population-weighted density: every county's density is
	# weighted by that county's share of the state population, the weighted values
	# are summed per state, and the sum is rounded to a whole number.
	result <- left_join(df, sdf, by = "state") %>%
	  group_by(state) %>%
	  summarise(weighted_density = round(sum(
	    population / state_pop * density_population_sqmi_land
	  ), 0)) %>%
	  ungroup() %>%
	  # attach the weighted figure to the state table; the key is spelled out so
	  # the join matches the first one and does not depend on which columns the
	  # two tables happen to share
	  left_join(sdf, ., by = "state") %>%
	  arrange(desc(weighted_density)) %>%
	  # mark states whose weighted density is more than 10x the unweighted density
	  mutate(highlight = weighted_density / state_density > 10)

	# save clean data for posterity
	write_csv(result, "result.csv")

	# Scatterplot, Schulte style: unweighted (x) against weighted (y) density for
	# each state on log10 axes; point and label colour follow the highlight flag.
	p <- ggplot(
	  result,
	  aes(x = state_density, y = weighted_density, color = highlight)
	) +
	  theme_bw() +
	  # `labels` is written in full rather than relying on partial matching of
	  # the abbreviated `label`
	  scale_x_log10(
	    breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
	    labels = comma
	  ) +
	  scale_y_log10(
	    breaks = c(1, 3, 10, 30, 100, 300, 1000, 3000, 10000),
	    labels = comma
	  ) +
	  geom_point() +
	  geom_text_repel(aes(label = geographic_area)) +
	  # slope-1 reference line
	  geom_abline(slope = 1) +
	  theme(legend.position = "none") +
	  labs(x = "Unweighted Population Density", y = "Weighted Population Density")
	p
	#ggsave(plot = p, filename = "unweighted_v_weighted_density.png", height = 8, width = 8)

	# Reshape result to long form: two rows per state, one for each density
	# measure, with the measure name in `measure` and its value in `density`.
	result_l <- result %>%
	  # copy of the weighted density that stays a column of its own after the
	  # reshape, used for sorting
	  mutate(sortval = weighted_density) %>%
	  gather(key = measure, value = density, state_density:weighted_density) %>%
	  arrange(sortval, measure) %>%
	  # fix the factor level order: weighted first, then unweighted
	  mutate(
	    measure = factor(measure, levels = c("weighted_density", "state_density"))
	  )

	# Dot plot with one row per state, rows ordered by weighted density (sortval);
	# each row shows both density measures on a log10 x axis.
	p <- ggplot(
	  result_l,
	  aes(x = density, y = reorder(geographic_area, sortval), color = measure)
	) +
	  theme_bw() +
	  geom_point(size = 3) +
	  # connect the two measures for each state with a line
	  geom_line(aes(group = geographic_area), color = "black") +
	  # `labels` is written in full rather than relying on partial matching of
	  # the abbreviated `label`
	  scale_x_log10(
	    breaks = c(10, 30, 100, 300, 1000, 3000, 10000),
	    labels = comma
	  ) +
	  theme(legend.position = "bottom") +
	  labs(x = "Population density", y = "States ranked by weighted population density") +
	  scale_color_discrete(
	    name = "",
	    breaks = c("weighted_density", "state_density"),
	    labels = c("Weighted Population Density", "Unweighted Population Density")
	  )
	p
	#ggsave(plot = p, filename = "state_v_unweighted_and_weighted_density.png", height = 8, width = 6)