Skip to content

Instantly share code, notes, and snippets.

@thoughtfulbloke
Created September 15, 2019 02:26
Show Gist options
  • Save thoughtfulbloke/720083f2c49a2aab527085c1d25d7ecb to your computer and use it in GitHub Desktop.
Save thoughtfulbloke/720083f2c49a2aab527085c1d25d7ecb to your computer and use it in GitHub Desktop.
Code for converting the tables in the ESR measles outbreak weekly report pdfs into excel files
library(rvest)
library(janitor)
library(pdftools)
library(dplyr)
library(tabulizer) #needs Java, see https://github.com/ropensci/tabulizer
library(writexl)
library(purrr)
## store pdfs locally in folder ESR_measles
if (!dir.exists("ESR_measles")) {
  dir.create("ESR_measles")
}
## scrape the index page for the links to each 2019 weekly report page
start_URL <- "https://surv.esr.cri.nz/surveillance/WeeklyMeaslesRpt.php"
start_page <- read_html(start_URL)
# Select the anchors once so the href and text vectors come from the same
# node set and stay index-aligned.
start_anchors <- start_page %>% html_nodes("a")
all_start_URLS <- start_anchors %>% html_attr("href")
all_start_text <- start_anchors %>% html_text()
# Keep the hrefs of anchors whose text names a 2019 report. Anchors without
# an href give NA, which paste0() would turn into the literal URL ".../NA".
report_hrefs <- all_start_URLS[grep("Measles Report 2019", all_start_text)]
report_hrefs <- report_hrefs[!is.na(report_hrefs)]
# paste0() treats a zero-length argument as "", so without this guard an
# empty match would silently yield the bare site root as a "report" URL.
if (length(report_hrefs) == 0) {
  stop("No 'Measles Report 2019' links found at ", start_URL, call. = FALSE)
}
stage2_urls <- paste0("https://surv.esr.cri.nz", report_hrefs)
# Collect the report PDF link(s) found on one weekly report page.
#   x: URL of the page to scan.
# Returns a character vector of absolute PDF URLs (duplicates removed). When
# the page has no matching link a warning is raised and character(0) is
# returned.
get_second_links <- function(x) {
  every_urls <- read_html(x) %>%
    html_nodes("a") %>%
    html_attr("href")
  base_url <- unique(
    grep("/PDF_surveillance/MeaslesRpt/2019/", every_urls, value = TRUE)
  )
  # paste0() treats a zero-length argument as "", so pasting an empty match
  # would return the bare site root as if it were a PDF link.
  if (length(base_url) == 0) {
    warning("No 2019 measles report PDF link found on ", x, call. = FALSE)
    return(character(0))
  }
  paste0("https://surv.esr.cri.nz", base_url)
}
# A page may yield zero or several PDF links. sapply() would then return a
# list or matrix instead of a character vector, so gather the per-page
# results in a list and flatten them. Naming the input by URL keeps the same
# element names sapply() produced when every page has exactly one link.
pdf_links_by_page <- lapply(setNames(nm = stage2_urls), get_second_links)
second_link <- unlist(pdf_links_by_page)
if (is.null(second_link)) {
  # unlist() of an empty list is NULL; keep the type a character vector.
  second_link <- character(0)
}
# file.path() returns a zero-length result for zero-length input, whereas
# paste0() would invent the path "ESR_measles/".
save_path <- file.path("ESR_measles", basename(second_link))
# Download one report PDF unless a copy is already on disk.
#   x: URL of the PDF to fetch.
#   y: local path the PDF is saved to.
# Returns "current" when y already exists (nothing is downloaded) and "new"
# after a successful download. A failed download raises an error and leaves
# no file at y.
get_reports <- function(x, y) {
  if (file.exists(y)) {
    return("current")
  }
  # A failed transfer can leave a partial file at y. The existence check
  # above would then report it as "current" on every later run, so remove
  # it before re-raising the error.
  status <- tryCatch(
    download.file(url = x, destfile = y, method = "libcurl", mode = "wb"),
    error = function(e) {
      unlink(y)
      stop(e)
    }
  )
  # download.file() returns 0 on success; treat any other code as a failure.
  if (!identical(as.integer(status), 0L)) {
    unlink(y)
    stop("Download failed for ", x, call. = FALSE)
  }
  "new"
}
# Fetch each report that is not yet on disk, pairing every URL with its
# local path; get_reports() yields "new" or "current" for each pair.
downloads <- mapply(
  FUN = get_reports,
  x = second_link,
  y = save_path
)
###
# Extract the tables from one PDF and write them to an .xlsx file whose path
# is the PDF path with a trailing ".pdf" (any case) replaced by ".xlsx".
#   x: path to the PDF file.
data_from_pdf <- function(x) {
  # tabulizer: one character matrix per table
  table_matrices <- extract_tables(x)
  # writexl wants data frames, so convert each matrix
  sheets <- lapply(
    table_matrices,
    function(m) as.data.frame(m, stringsAsFactors = FALSE)
  )
  xlsx_path <- paste0(sub("\\.pdf$", "", x, ignore.case = TRUE), ".xlsx")
  write_xlsx(sheets, xlsx_path, col_names = FALSE) # writexl
}
# Convert every downloaded PDF for its side effect (the .xlsx file).
# NOTE(review): purrr::walk() is documented to return its input invisibly, so
# PDFtext presumably holds the paths, not extracted text - confirm if used.
PDFtext <- walk(.x = save_path, .f = data_from_pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment