Skip to content

Instantly share code, notes, and snippets.

@shuckle16
Created March 3, 2022 13:16
Show Gist options
  • Save shuckle16/6d178449b7dd946ff4e2ab6190928ea8 to your computer and use it in GitHub Desktop.
Save shuckle16/6d178449b7dd946ff4e2ab6190928ea8 to your computer and use it in GitHub Desktop.
Wikipedia scraper for the population of Tennessee (TN) cities
library(dplyr)
library(foreach)
library(ggplot2)
library(ggrepel)
library(glue)
library(janitor)
library(rvest)
library(doParallel)
# Register a doParallel backend with three workers for the %dopar% loop below.
registerDoParallel(cores = 3)
# City names substituted into the "<city>,_Tennessee" Wikipedia article URL.
cities <- c(
  "Knoxville",
  "Nashville",
  "Memphis",
  "Chattanooga",
  "Clarksville",
  "Murfreesboro",
  "Jackson",
  "Franklin"
)
#' Scrape the population table from a Tennessee city's Wikipedia article
#'
#' Downloads `https://en.wikipedia.org/wiki/<city>,_Tennessee`, takes the first
#' element matching the `.toccolours` CSS class, promotes its first data row to
#' column names, drops that row and the last row, and converts the `census`
#' and `pop` columns to numeric.
#'
#' @param city City name as it appears in the article title, e.g.
#'   `"Knoxville"`. Spaces are turned into underscores and the title is
#'   percent-encoded, so multi-word names also yield a valid URL.
#' @return A data frame with two numeric columns, `census` and `pop`. Values
#'   that cannot be parsed as numbers become `NA` (with a coercion warning).
#' @details Stops with an error if the page has no `.toccolours` table or if
#'   the table lacks `census` / `pop` columns after name cleaning.
get_city_population_data <- function(city) {
  # Build the article title; unreserved characters such as "," are left alone
  # by URLencode(), so single-word cities produce the same URL as before.
  page_title <- gsub(" ", "_", glue("{city},_Tennessee"), fixed = TRUE)
  url <- paste0("https://en.wikipedia.org/wiki/", utils::URLencode(page_title))

  tables <- read_html(url) %>%
    html_nodes(".toccolours") %>%
    html_table(header = TRUE)

  if (length(tables) == 0) {
    stop("No `.toccolours` table found at ", url, call. = FALSE)
  }

  # Normalise the scraped header first so the dplyr verbs below operate on
  # syntactic, unique column names.
  raw <- janitor::clean_names(tables[[1]])

  # The real column labels live in the first data row; promote them to names.
  names(raw) <- as.character(slice(raw, 1))
  raw <- janitor::clean_names(raw)

  if (!all(c("census", "pop") %in% names(raw))) {
    stop(
      "Expected `census` and `pop` columns in the table at ", url,
      "; found: ", paste(names(raw), collapse = ", "),
      call. = FALSE
    )
  }

  # Drop the promoted label row (first) and the trailing row (last).
  n_rows <- nrow(raw)

  raw %>%
    slice(-c(1, n_rows)) %>%
    mutate(
      pop = as.numeric(gsub(",", "", pop)),
      census = as.numeric(census)
    ) %>%
    dplyr::select(census, pop)
}
# Scrape every city in parallel and stack the per-city tables row-wise,
# tagging each row with the city it came from.
# `.packages` attaches the scraping dependencies on each worker: socket-based
# clusters start fresh R sessions that do not inherit the packages attached
# in this session, so `%>%`, `mutate()` and `read_html()` would otherwise be
# missing there.
pops <-
  foreach(
    city_name = cities,
    .combine = rbind,
    .packages = c("dplyr", "glue", "janitor", "rvest")
  ) %dopar% {
    get_city_population_data(city_name) %>% mutate(city = city_name)
  }
# Line chart of population by census year, one colour per city.
# Each line is labelled in place at the census 20 years before the latest one
# in the data, and the colour legend is hidden.
pops %>%
  mutate(
    # na.rm = TRUE: one unparseable census value would otherwise turn the
    # comparison into NA for every row and silently remove all labels.
    label = if_else(
      census == (max(census, na.rm = TRUE) - 20),
      as.character(city),
      NA_character_
    )
  ) %>%
  ggplot(aes(x = census, y = pop, color = city)) +
  geom_point() +
  geom_line() +
  geom_label_repel(
    aes(label = label),
    force = 25,
    na.rm = TRUE,
    min.segment.length = 0
  ) +
  labs(
    title = "Population of major cities in Tennessee over time",
    x = "Census Year",
    y = "Population"
  ) +
  scale_y_continuous(labels = scales::comma) +
  # `guide = FALSE` is deprecated in ggplot2; "none" is the supported form.
  scale_color_discrete(guide = "none") +
  theme(text = element_text(size = 18))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment