Last active
April 17, 2018 13:13
-
-
Save christlc/c6cd2a2f3ff99b69ae2884abffd1513a to your computer and use it in GitHub Desktop.
Extract Hong Kong Government budget tables from PDF files - a semi-automatic solution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tabulizer) | |
#### PARAMETERS ####
# Text searched for in each page's extracted text to find the page that holds
# the table of interest.
target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"
# Name of the PDF; used both for the local copy and as the last component of
# the download URL.
filename <- "head156.pdf"
# Budget year; used as a sub-directory of "pdf" and as a component of the
# download URL.
year <- 2012
####################
# Locate the first page of a PDF whose extracted text matches a pattern.
#
# f          : path to the PDF file, handed to tabulizer.
# target_str : pattern passed to stringr::str_detect(); a plain character
#              string is therefore interpreted as a regular expression.
#
# Returns the 1-based number of the first matching page, or NULL when no page
# matches (which includes a document that reports zero pages).
locate_page <- function(f, target_str) {
  # seq_len() gives an empty sequence for a zero-page document, whereas
  # 1:0 would iterate over c(1, 0) and request pages that do not exist.
  for (n in seq_len(tabulizer::get_n_pages(f))) {
    page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
    # any(..., na.rm = TRUE) treats a missing (NA) detection result as
    # "no match" instead of letting if() fail on an NA condition.
    if (any(stringr::str_detect(page_text, target_str), na.rm = TRUE)) {
      return(n)
    }
  }
  NULL
}
# Interactively collect column areas on one page and derive table boundaries.
#
# locate_areas() is called repeatedly on the same page; every non-NULL
# selection is recorded, and the loop ends as soon as a call yields no
# selection. Each selection is expected to be a numeric vector with elements
# named top, left, bottom and right (the summary below indexes by those names).
#
# f           : path to the PDF file.
# target_page : page number on which the areas are drawn.
#
# Returns a list with two elements, each wrapped in a list so it can be passed
# directly to tabulizer::extract_tables():
#   columns_boundaries - the `right` coordinate of every selection, in the
#                        order they were drawn;
#   area_boundaries    - c(top, left, bottom, right) of the bounding box that
#                        encloses all selections.
# Stops with an error when no area was selected at all.
locate_via_shiny <- function(f, target_page) {
  column_loc_list <- list()
  repeat {
    loc <- locate_areas(f, pages = target_page)
    # An empty result or a NULL first element both mean "nothing selected".
    if (length(loc) == 0 || is.null(loc[[1]])) {
      break
    }
    column_loc_list[[length(column_loc_list) + 1]] <- loc[[1]]
  }

  if (length(column_loc_list) == 0) {
    stop("no area was selected on page ", target_page, call. = FALSE)
  }

  # One row per selection, columns named after the vector elements. Base R is
  # used here so the function does not depend on dplyr (the original relied on
  # the deprecated rbind_list() and on the pipe without loading either).
  selections <- do.call(rbind, column_loc_list)

  area_boundaries <- c(
    top    = min(selections[, "top"]),
    left   = min(selections[, "left"]),
    bottom = max(selections[, "bottom"]),
    right  = max(selections[, "right"])
  )

  # unlist() on a named list reproduces the element names the original
  # data-frame based code produced ("right" or "right1", "right2", ...).
  columns_boundaries <- unlist(list(right = unname(selections[, "right"])))

  list(
    columns_boundaries = list(columns_boundaries),
    area_boundaries = list(area_boundaries)
  )
}
# Extract the table that sits on the page containing `target_str`.
#
# The page is found with locate_page(), the table area and column separators
# are chosen interactively through locate_via_shiny(), and the table is then
# read with tabulizer::extract_tables() using those boundaries (guess = FALSE).
# Every "." is removed from the character columns of the result.
#
# f          : path to the PDF file.
# target_str : pattern used by locate_page() to find the page.
#
# Returns the first extracted table as a data frame, or NULL (with a warning)
# when the target string is not found or no table could be extracted.
extract_from_pdf <- function(f, target_str) {
  target_page <- locate_page(f, target_str)
  if (is.null(target_page)) {
    warning("target string not found")
    return(NULL)
  }
  # message() ends the status line with a newline; the original cat() did not.
  message("Found keywords on page ", target_page)

  boundaries <- locate_via_shiny(f, target_page)
  tables <- tabulizer::extract_tables(
    f,
    pages = target_page,
    area = boundaries$area_boundaries,
    columns = boundaries$columns_boundaries,
    guess = FALSE,
    output = "data.frame"
  )
  # Guard against an empty result instead of failing on tables[[1]].
  if (length(tables) == 0) {
    warning("no table could be extracted from page ", target_page)
    return(NULL)
  }

  tbl <- tables[[1]]
  # Strip every literal dot from character columns only; other column types
  # are left untouched. Done in base R so dplyr is not required.
  is_chr <- vapply(tbl, is.character, logical(1))
  if (any(is_chr)) {
    tbl[is_chr] <- lapply(
      tbl[is_chr],
      function(col) stringr::str_replace_all(col, "\\.", "")
    )
  }
  tbl
}
# Download the budget PDF for one year and extract the target table from it.
#
# The file is fetched from https://www.budget.gov.hk/<year>/eng/pdf/<filename>
# and stored as pdf/<year>/<filename>. `filename` and `target_str` are read
# from the enclosing (global) environment.
#
# year : budget year, used in both the URL and the local directory name.
#
# Returns whatever extract_from_pdf() returns for the downloaded file.
download_and_extract <- function(year) {
  pdf_dir <- file.path("pdf", year)
  f <- file.path(pdf_dir, filename)
  # recursive = TRUE also creates the parent "pdf" directory; with
  # recursive = FALSE the call fails silently when "pdf" does not exist yet.
  dir.create(pdf_dir, showWarnings = FALSE, recursive = TRUE)
  # mode = "wb" transfers the PDF as binary so it is not corrupted on Windows.
  download.file(
    paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
    f,
    mode = "wb"
  )
  extract_from_pdf(f, target_str)
}
# Download the PDF for the configured `year` and extract its table.
# `f` is kept as a top-level variable holding the path of the downloaded file.
f <- file.path("pdf", year, filename)
# recursive = TRUE also creates the parent "pdf" directory when it is missing.
dir.create(file.path("pdf", year), showWarnings = FALSE, recursive = TRUE)
# mode = "wb" transfers the PDF as binary so it is not corrupted on Windows.
download.file(
  paste0("https://www.budget.gov.hk/", year, "/eng/pdf/", filename),
  f,
  mode = "wb"
)
result <- extract_from_pdf(f, target_str)

# Repeat the download-and-extract step for the 2008 and 2009 budgets; the
# result is a list with one element per year.
all_result <- lapply(2008:2009, download_and_extract)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the number of every page of the PDF at path `f` whose extracted text
# matches `target_str`. `f` must already be defined before this snippet runs.
target_str <- "An analysis of the financial provision under Subhead 000 Operational expenses is as follows"
# seq_len() avoids iterating over c(1, 0) when the document has zero pages.
for (n in seq_len(tabulizer::get_n_pages(f))) {
  page_text <- tabulizer::extract_text(f, pages = as.numeric(n))
  # An NA detection result counts as "no match" instead of breaking if().
  if (any(stringr::str_detect(page_text, target_str), na.rm = TRUE)) {
    print(n)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment