Skip to content

Instantly share code, notes, and snippets.

@patperu
Last active July 16, 2016 07:10
Show Gist options
  • Save patperu/41774f681a65de74da4a9075358556e1 to your computer and use it in GitHub Desktop.
Save patperu/41774f681a65de74da4a9075358556e1 to your computer and use it in GitHub Desktop.
Extract data from a PDF
id data region ws01_14 ws11_14
Stadt Zehdenick 2014 13325 79 Land Brandenburg 291 84 Land Brandenburg 291 84
Stadt Zossen 1990 12247 100 Ausland/unbek 169 105 Ausland/unbek 169 105
Stadt Zossen 2000 16310 133 Land Berlin 1056 155 Land Berlin 1056 155
Stadt Zossen 2005 17183 140 Alte Bundesländer -409 -2 Alte Bundesländer -409 -2
Stadt Zossen 2010 17606 144 Neue Bundesländer 177 33 Neue Bundesländer 177 33
Stadt Zossen 2014 17657 144 Land Brandenburg 1210 299 Land Brandenburg 1210 299
library('pdftools')
library('stringi')
options(stringsAsFactors = FALSE)
#' Pull one field out of every element of a list and convert it to numeric.
#'
#' @param x List of vectors (in this script: the per-row matches returned by
#'   stri_extract_all_regex()).
#' @param id Index handed to `[` for each element of `x`.
#' @return Numeric vector with one entry per element of `x`: the selected
#'   field with surrounding whitespace trimmed and every "." removed
#'   (presumably thousands separators), converted with as.numeric(). An
#'   index beyond the end of an element yields NA.
getWSaldo <- function(x, id) {
  picked <- sapply(x, function(fields) fields[id])
  without_dots <- gsub("\\.", "", trimws(picked))
  as.numeric(without_dots)
}
# Read the text of the PDF; extract_wsaldo() indexes this vector by page number.
txt <- pdf_text("rb_00.....pdf")
#' Extract the W-Saldo rows from a range of pages of extracted PDF text.
#'
#' For every page the text is split into lines; line 2 is taken as the id
#' and lines 5 to 12 as candidate data rows. Only rows whose alphabetic part
#' is one of the five known region names are kept, and the 4th and 5th
#' numbers found in each kept row become `ws01_14` and `ws11_14`.
#'
#' @param text Character vector indexed by page (the result of pdf_text()).
#' @param first_page,last_page First and last page index to process
#'   (inclusive).
#' @return A data.frame with columns `id`, `data`, `region`, `ws01_14` and
#'   `ws11_14`, one row per recognised region line, with default row names.
extract_wsaldo <- function(text, first_page, last_page) {
  regions <- c("Ausland/unbek", "Land Berlin",
               "Alte Bundesländer", "Neue Bundesländer",
               "Land Brandenburg")
  pages <- first_page:last_page
  # Preallocate one slot per processed page instead of indexing the result
  # list by absolute page number (which left leading NULL slots).
  fin <- vector("list", length(pages))
  for (k in seq_along(pages)) {
    x <- text[pages[k]]
    # Mark line breaks before whitespace is collapsed. Accept "\n" as well as
    # "\r\n": with only "\r\n" handled, text using bare "\n" separators was
    # folded into a single line and no data rows were found.
    x <- gsub("\r?\n", "::", x)
    x <- gsub("\\s+", " ", x)
    # Re-attach a minus sign that is separated from its number by a space.
    x <- gsub("- ", "-", x)
    x <- strsplit(x, "::")[[1]]
    # W-Saldo for two periods: line 2 holds the id, lines 5-12 the data rows.
    # stringsAsFactors is set explicitly so the function does not rely on a
    # global option.
    x <- data.frame(id = trimws(x[2]), data = x[5:12],
                    stringsAsFactors = FALSE)
    # trimws() coerces the list of matches to character, so only rows with a
    # single alphabetic run produce a plain name that can pass the filter.
    x$region <- trimws(stri_extract_all_regex(x$data, "[[:alpha:]][ \\/[:alpha:]]*"))
    x <- x[x$region %in% regions, ]
    # Drop "." so numbers such as "1.210" are matched as one run of digits.
    x$data <- stri_replace_all_regex(x$data, "\\.", "")
    wsaldo <- stri_extract_all_regex(x$data, "-?[[:digit:]][[:digit:]]*")
    wsaldo <- data.frame(ws01_14 = getWSaldo(wsaldo, 4),
                         ws11_14 = getWSaldo(wsaldo, 5))
    fin[[k]] <- cbind(x, wsaldo)
  }
  fin <- do.call("rbind", fin)
  rownames(fin) <- NULL
  fin
}
# Process pages 27 to 110 and write the last rows (tail() default) to a CSV.
fin <- extract_wsaldo(text = txt, first_page = 27, last_page = 110)
write.csv(
  tail(fin),
  file = "wsaldo.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment