Last active
July 16, 2016 07:10
-
-
Save patperu/41774f681a65de74da4a9075358556e1 to your computer and use it in GitHub Desktop.
Extract data from a PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
id | data | region | ws01_14 | ws11_14 |
---|---|---|---|---|
Stadt Zehdenick | 2014 13325 79 Land Brandenburg 291 84 | Land Brandenburg | 291 | 84 |
Stadt Zossen | 1990 12247 100 Ausland/unbek 169 105 | Ausland/unbek | 169 | 105 |
Stadt Zossen | 2000 16310 133 Land Berlin 1056 155 | Land Berlin | 1056 | 155 |
Stadt Zossen | 2005 17183 140 Alte Bundesländer -409 -2 | Alte Bundesländer | -409 | -2 |
Stadt Zossen | 2010 17606 144 Neue Bundesländer 177 33 | Neue Bundesländer | 177 | 33 |
Stadt Zossen | 2014 17657 144 Land Brandenburg 1210 299 | Land Brandenburg | 1210 | 299 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages: pdftools provides pdf_text(), stringi provides the stri_* helpers.
library('pdftools')
library('stringi')

# Keep strings as character vectors when building data frames.
# (This is already the default from R 4.0 on; kept for older R versions.)
options(stringsAsFactors = FALSE)
# Take the `id`-th token of every element of `x` and convert it to a number.
#
# x:  list of character vectors, one per data row (the script passes the
#     result of stri_extract_all_regex()).
# id: 1-based position of the token to pick from each element.
#
# Returns a numeric vector with one value per element of `x`. Elements that
# are NULL or hold fewer than `id` tokens yield NA. Surrounding whitespace and
# all dots are removed before conversion (the dots are presumably thousands
# separators such as "1.234" -- confirm against the PDF).
getWSaldo <- function(x, id) {
  # vapply() guarantees a character vector regardless of the input's shape;
  # sapply() would fall back to a list for ragged or NULL elements.
  tokens <- vapply(
    x,
    function(v) if (length(v) < id) NA_character_ else as.character(v[[id]]),
    character(1),
    USE.NAMES = FALSE
  )
  as.numeric(gsub(".", "", trimws(tokens), fixed = TRUE))
}
# Read the PDF; `txt` is indexed by page number further down.
txt <- pdf_text("rb_00.....pdf")
# Parse the text of a single page into a data frame of W-Saldo rows.
#
# page_text: one character string holding the text of one page.
#
# Returns a data frame with the columns id, data, region, ws01_14 and ws11_14.
# It has zero rows when none of lines 5-12 of the page starts its text part
# with one of the known region labels.
.parse_wsaldo_page <- function(page_text) {
  regions <- c("Ausland/unbek", "Land Berlin",
               "Alte Bundesländer", "Neue Bundesländer",
               "Land Brandenburg")

  # Mark line breaks before collapsing whitespace. Both "\r\n" and a bare
  # "\n" are accepted so the result does not depend on the platform's
  # line-ending convention.
  page_text <- gsub("\r?\n", "::", page_text)
  page_text <- gsub("\\s+", " ", page_text)
  # Re-attach a detached minus sign to the number that follows it.
  page_text <- gsub("- ", "-", page_text)
  page_lines <- strsplit(page_text, "::")[[1]]

  # W-Saldo values for two time periods: line 2 of the page is used as the
  # id, lines 5-12 are the candidate data rows.
  rows <- data.frame(
    id = trimws(page_lines[2]),
    data = page_lines[5:12],
    stringsAsFactors = FALSE
  )

  # The first alphabetic run of a row (letters, spaces, "/") is its region.
  rows$region <- trimws(
    stri_extract_first_regex(rows$data, "[[:alpha:]][ \\/[:alpha:]]*")
  )
  rows <- rows[rows$region %in% regions, ]

  # Drop dots so that every number is one uninterrupted digit run.
  rows$data <- stri_replace_all_regex(rows$data, "\\.", "")
  numbers <- stri_extract_all_regex(rows$data, "-?[[:digit:]][[:digit:]]*")

  # The 4th and 5th number of a row are the two W-Saldo values.
  saldo <- data.frame(
    ws01_14 = getWSaldo(numbers, 4),
    ws11_14 = getWSaldo(numbers, 5)
  )
  cbind(rows, saldo)
}

# Extract the W-Saldo rows from a range of pages.
#
# text:       character vector with one element per page (the script passes
#             the result of pdf_text()).
# first_page: number of the first page to parse (>= 1).
# last_page:  number of the last page to parse (>= 1).
#
# Returns one data frame with the rows of all pages in ascending page order
# and row names reset. Pages beyond the end of `text` contribute no rows.
extract_wsaldo <- function(text, first_page, last_page) {
  stopifnot(
    length(first_page) == 1, length(last_page) == 1,
    first_page >= 1, last_page >= 1
  )
  # Ascending order even if the bounds are passed in reverse.
  pages <- sort(first_page:last_page)

  per_page <- lapply(unname(text[pages]), .parse_wsaldo_page)
  fin <- do.call("rbind", per_page)
  rownames(fin) <- NULL
  fin
}
# Parse pages 27 to 110 of the PDF.
fin <- extract_wsaldo(txt, 27, 110)

# Only the last rows (tail() default) are written, as a small sample.
write.csv(
  tail(fin),
  file = "wsaldo.csv",
  fileEncoding = "UTF-8",
  row.names = FALSE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment