Created
September 15, 2019 02:26
-
-
Save thoughtfulbloke/720083f2c49a2aab527085c1d25d7ecb to your computer and use it in GitHub Desktop.
Code for converting the tables in the ESR measles outbreak weekly report PDFs into Excel files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(janitor) | |
library(pdftools) | |
library(dplyr) | |
library(tabulizer) # needs Java, see https://github.com/ropensci/tabulizer
library(writexl) | |
library(purrr) | |
## store pdfs locally in folder ESR_measles
if (!dir.exists("ESR_measles")) {
  dir.create("ESR_measles")
}

# Read the index page and collect the href and the visible text of every link.
start_URL <- "https://surv.esr.cri.nz/surveillance/WeeklyMeaslesRpt.php"
start_page <- read_html(start_URL)
all_start_URLS <- start_page %>% html_nodes("a") %>% html_attr("href")
all_start_text <- start_page %>% html_nodes("a") %>% html_text()

# Keep the links whose text mentions a 2019 measles report. Anchors without an
# href (NA from html_attr()) are skipped so they cannot become "...nzNA".
is_report_link <- grepl("Measles Report 2019", all_start_text) &
  !is.na(all_start_URLS)

# paste0() treats a zero-length argument as "", so without this guard an empty
# match would silently yield the bare site root as the only "report" page.
if (!any(is_report_link)) {
  stop("No links labelled 'Measles Report 2019' found at ", start_URL,
       call. = FALSE)
}

# NOTE(review): prefixing the host assumes the hrefs are site-relative paths
# (starting with "/") -- confirm against the live page.
stage2_urls <- paste0(
  "https://surv.esr.cri.nz",
  all_start_URLS[is_report_link]
)
# Find the report PDF link(s) on one page.
#
# x: URL of the page to scan.
# Returns the hrefs containing "/PDF_surveillance/MeaslesRpt/2019/",
# de-duplicated and prefixed with "https://surv.esr.cri.nz".
# Stops with an error when the page has no such link.
get_second_links <- function(x) {
  every_urls <- read_html(x) %>% html_nodes("a") %>% html_attr("href")
  base_url <- unique(
    grep("/PDF_surveillance/MeaslesRpt/2019/", every_urls, value = TRUE)
  )
  # paste0() treats a zero-length argument as "", so an empty match would
  # otherwise come back as the bare site root and later be downloaded as if
  # it were a report PDF.
  if (length(base_url) == 0) {
    stop("No measles report PDF link found on ", x, call. = FALSE)
  }
  paste0("https://surv.esr.cri.nz", base_url)
}
# One PDF URL per report page. vapply() (rather than sapply()) always yields a
# character vector and fails immediately if a page gives anything other than
# exactly one link, instead of passing a list on to basename().
second_link <- vapply(stage2_urls, get_second_links, character(1))
# Local destination for each PDF: the file part of its URL inside ESR_measles/.
save_path <- paste0("ESR_measles/", basename(second_link))
# Download one file unless it is already on disk.
#
# x: URL to fetch.
# y: destination file path.
# Returns "current" when `y` already exists (nothing is downloaded), otherwise
# "new" after a successful download. A failed download raises an error and
# leaves no file at `y`.
get_reports <- function(x, y) {
  if (file.exists(y)) {
    return("current")
  }
  # Remove any partial file if we leave early (error or interrupt); otherwise
  # the file.exists() check above would treat a broken download as "current"
  # on every later run.
  finished <- FALSE
  on.exit(if (!finished) unlink(y), add = TRUE)
  status <- download.file(url = x, destfile = y, method = "libcurl",
                          mode = "wb")
  # download.file() signals some failures through a non-zero return code
  # instead of an error.
  if (status != 0) {
    stop("Download of ", x, " failed with status ", status, call. = FALSE)
  }
  finished <- TRUE
  "new"
}
# Pair each PDF URL with its local path and fetch the ones not yet on disk;
# `downloads` records the value get_reports() returned for each pair.
downloads <- mapply(
  FUN = get_reports,
  x = second_link,
  y = save_path
)
###
# Extract the tables from one PDF and write them to an .xlsx file next to it.
#
# x: path to a PDF file.
# The workbook path is `x` with a trailing ".pdf" (any case) replaced by
# ".xlsx"; it is written without column-name header rows.
data_from_pdf <- function(x) {
  # tabulizer: one character matrix per table
  table_matrices <- extract_tables(x)
  # writexl wants data frames, so convert each matrix
  sheets <- map(table_matrices, as.data.frame, stringsAsFactors = FALSE)
  xlsx_path <- paste0(sub("\\.pdf$", "", x, ignore.case = TRUE), ".xlsx")
  write_xlsx(sheets, xlsx_path, col_names = FALSE)
}
# Convert every downloaded PDF. walk() is called for its side effects and
# returns its input invisibly, so PDFtext ends up holding save_path.
PDFtext <- walk(.x = save_path, .f = data_from_pdf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment