Skip to content

Instantly share code, notes, and snippets.

@MattSandy
Last active October 29, 2020 02:00
Show Gist options
  • Select an option

  • Save MattSandy/b4c09fb40c841bf4aa09a8790066aff7 to your computer and use it in GitHub Desktop.

Select an option

Save MattSandy/b4c09fb40c841bf4aa09a8790066aff7 to your computer and use it in GitHub Desktop.
Extracts tables from a PDF and puts them into a data frame
library(tidyverse)
library(pdftools)
# Week number interpolated into the report URL and the output file name.
week <- 42

# Row labels looked up in the PDF table, in lookup order. "Saint Louis" is
# spelled out in full, and the last entry is the "Unknown/missing" row.
mn_counties <- c(
  "Aitkin", "Anoka", "Becker", "Beltrami", "Benton", "Big Stone",
  "Blue Earth", "Brown", "Carlton", "Carver", "Cass", "Chippewa",
  "Chisago", "Clay", "Clearwater", "Cook", "Cottonwood", "Crow Wing",
  "Dakota", "Dodge", "Douglas", "Faribault", "Fillmore", "Freeborn",
  "Goodhue", "Grant", "Hennepin", "Houston", "Hubbard", "Isanti",
  "Itasca", "Jackson", "Kanabec", "Kandiyohi", "Kittson", "Koochiching",
  "Lac qui Parle", "Lake", "Lake of the Woods", "Le Sueur", "Lincoln",
  "Lyon", "McLeod", "Mahnomen", "Marshall", "Martin", "Meeker",
  "Mille Lacs", "Morrison", "Mower", "Murray", "Nicollet", "Nobles",
  "Norman", "Olmsted", "Otter Tail", "Pennington", "Pine", "Pipestone",
  "Polk", "Pope", "Ramsey", "Red Lake", "Redwood", "Renville", "Rice",
  "Rock", "Roseau", "Saint Louis", "Scott", "Sherburne", "Sibley",
  "Stearns", "Steele", "Stevens", "Swift", "Todd", "Traverse", "Wabasha",
  "Wadena", "Waseca", "Washington", "Watonwan", "Wilkin", "Winona",
  "Wright", "Yellow Medicine", "Unknown/missing"
)

# Address of the weekly report PDF for the chosen week.
location <- sprintf(
  "https://www.health.state.mn.us/diseases/coronavirus/stats/covidweekly%s.pdf",
  week
)
# Extract the table
# pdf_data() yields a list with one data frame per page; each has a `text`
# column holding the tokens found on that page.
df <- pdftools::pdf_data(location)

# Flag every page whose tokens mention the first county label. vapply() keeps
# the result a logical vector even when the PDF has no pages.
has_table <- vapply(
  df,
  function(pdf_table) any(grepl(mn_counties[1], pdf_table$text)),
  logical(1)
)

# Without this guard a report that lacks the table would index the page list
# with NA, yield NULL, and silently produce an all-NA result further down.
if (!any(has_table)) {
  stop(
    "No page of ", location, " mentions '", mn_counties[1],
    "'; cannot locate the county table.",
    call. = FALSE
  )
}

# Tokens of the first page that mentions the first county label.
matched <- df[[which(has_table)[1]]]$text
# Convert to a long character string
collapsed <- paste(matched, collapse = " ")
# Fix for St. Louis County: the lookup list spells it "Saint Louis".
# fixed() treats the "." literally instead of as a regex wildcard, and
# str_replace_all() rewrites every occurrence, so a mention earlier on the
# page cannot leave the table row itself unconverted.
collapsed <- str_replace_all(collapsed, fixed("St. Louis"), "Saint Louis")
# Create Dataframe
# Build one regex per county label. The first label only needs a leading
# space; every later label must directly follow a number (the end of the
# previous row), which keeps "Lake" from matching inside "Red Lake".
# County names are interpolated into the regex as-is.
row_prefix <- ifelse(mn_counties == mn_counties[1], "([ ])", "([0-9]+ )")
patterns <- paste0(row_prefix, "(", mn_counties, ") ([0-9,]+) ([0-9,]+)?")

# str_match() is vectorised over `pattern`: one result row per county, with
# columns = full match, prefix, county, first number, optional second number.
county_matches <- str_match(collapsed, patterns)

# Assemble all rows at once instead of growing the data frame in a loop.
# The captured values are kept as text exactly as they appear in the PDF
# (including any thousands separators); unmatched counties yield NA.
results <- data.frame(
  county = mn_counties,
  tests = county_matches[, 4],
  rate = county_matches[, 5],
  stringsAsFactors = FALSE
)

# Surface counties whose row could not be found rather than writing NA rows
# without any notice.
unmatched <- results$county[is.na(results$tests)]
if (length(unmatched) > 0) {
  warning(
    "No table row found for: ", paste(unmatched, collapse = ", "),
    call. = FALSE
  )
}

# Display
print(results)

# Write the table next to the script, tagged with the week number.
results %>% write_csv(paste0("results_week_", week, ".csv"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment