Skip to content

Instantly share code, notes, and snippets.

@iangow
Created December 17, 2023 21:04
Show Gist options
  • Save iangow/a69b3e7a0a704a57f60965cad42546f0 to your computer and use it in GitHub Desktop.
Save iangow/a69b3e7a0a704a57f60965cad42546f0 to your computer and use it in GitHub Desktop.
Code to scrape data on Newton streets from a PDF.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(readr) # For read_lines(), read_fwf(), etc.
library(stringr) # For str_c(), str_detect()
library(pdftools) # For pdf_text()
library(ggplot2)
# Number of leading lines of the extracted text to discard.
skip_rows <- 2

# Names given to the six columns pulled out of each table row.
col_names <- c(
  "street_name", "length_mi", "length_ft",
  "avg_width", "sq_yds", "avg_pci"
)

# Location of the PDF to scrape.
url <- "https://www.newtonma.gov/home/showpublisheddocument/97990/638140435866000000"

# Extract the text of the PDF, read it line by line (dropping the first
# `skip_rows` lines) and store the lines in a one-column tibble (`temp`).
street_data_raw <- tibble(
  temp = read_lines(pdf_text(url), skip = skip_rows)
)
# Regular expression describing one row of the street table. The capture
# groups correspond, in order, to the entries of `col_names`:
#   1. street_name: capitals, digits and whitespace, followed by a gap of
#      at least two whitespace characters
#   2. length_mi:   digits and decimal points
#   3. length_ft:   digits, decimal points and commas
#   4. avg_width:   digits only
#   5. sq_yds:      digits and decimal points
#   6. avg_pci:     digits and decimal points
regex <- str_c(
  "^\\s*",
  "([A-Z\\s0-9]+)\\s{2,}",
  "([0-9.]+)\\s+",
  "([0-9.,]+)\\s+",
  "([0-9]+)\\s+",
  "([0-9.]+)\\s+",
  "([0-9.]+)"
)

street_data <-
  street_data_raw |>
  # Drop empty lines and lines that start with the "Street Name" header.
  filter(
    !str_detect(temp, "^$"),
    !str_detect(temp, "^Street Name")
  ) |>
  # Use the regular expression to split each line into columns. Lines that
  # do not match `regex` produce NA in every column.
  extract(temp, col_names, regex) |>
  # The street-name group also matches whitespace, so trim what it captured.
  mutate(street_name = trimws(street_name)) |>
  # Remove every comma, not only the first, so values with more than one
  # thousands separator (e.g. "1,234,567") still convert to numbers.
  mutate(length_ft = str_replace_all(length_ft, ",", "")) |>
  # All columns after `street_name` hold numeric data.
  mutate(across(length_mi:avg_pci, as.numeric))
# Show the 25 rows with the largest `length_mi`.
print(arrange(street_data, desc(length_mi)), n = 25)

# Scatter plot of `avg_pci` against the log of `length_ft`.
ggplot(street_data, aes(x = log(length_ft), y = avg_pci)) +
  geom_point()

# Histogram of the log of `length_ft`.
ggplot(street_data, aes(x = log(length_ft))) +
  geom_histogram()

# Rows ordered by increasing `length_ft`.
arrange(street_data, length_ft)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment