Created
September 16, 2015 13:57
-
-
Save troyhill/d730135b099bdd054c98 to your computer and use it in GitHub Desktop.
Import annual NCDC local climatological data report pdfs to R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NCDC_combine <- function(pdfFiles, pdfToText = "C:\\Program Files\\xpdfbin-win-3.04\\bin64\\pdftotext.exe") { | |
# get pdftotxt from ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.04.zip | |
# get NCDC Local Climatological Data annual summary pdfs from https://www.ncdc.noaa.gov/IPS/lcd/lcd.html | |
combineWords <- function(input, terms = 9, n = 3, spacer = "_", spacerIn = " ") { | |
# function takes a vector of words with spaces (input), and returns some number of terms (terms), | |
# on the basis of combining sequences of n words | |
splitTerms <- strsplit(input, spacerIn)[[1]] # individual words | |
starts <- seq(from = 1, to = length(splitTerms), by = n) # start of new terms | |
for (i in 1:(length(starts))) { | |
tmpName <- paste(splitTerms[starts[i]:(starts[i] + n - 1)], collapse = spacer) | |
if (i != 1) { | |
output <- c(output, tmpName) | |
} else { | |
output <- tmpName | |
} | |
} | |
output | |
} | |
for (j in 1:length(pdfFiles)) { | |
dest <- pdfFiles[j] | |
# set path to pdftotxt.exe and convert pdf to text | |
exe <- pdfToText | |
system(paste("\"", exe, "\" \"", dest, "\"", sep = ""), wait = F) | |
# get txt-file name and open it | |
filetxt <- sub(".pdf", ".txt", dest) | |
suppressWarnings(txt <- readLines(filetxt)) # don't mind that there warning... | |
### basic info | |
year <- txt[[1]] # not sure this is generalizable | |
stn <- txt[[5]] # not sure this is generalizable | |
tempStartCol <- grep("MEAN DAILY", txt)[1] | |
monthCols <- grep("JAN FEB", txt) | |
tmpElements <- 9 # number of parameters per month - code fills with NAs AT THE TAIL if data is not present. Identifying non-tail data gaps will take more work. | |
mnths <- strsplit(as.character(txt[[monthCols[1]]]), " ")[[1]][1:12] | |
elements <- txt[[tempStartCol]] # most are three words. see https://stackoverflow.com/questions/26497583/split-a-string-every-5-characters for converting to a list of temperature data categories | |
# months are in clusters of three lines between 52 & 100 (ending with number of days with max > 90F) | |
for (i in 1:length(mnths)) { # data is in lines 52:96 | |
dataStart <- tempStartCol + 2 | |
startData <- (dataStart + 4*(i-1)) | |
dataLines <- c(startData:(startData + 2)) # data start every fourth line beginning at 52 | |
newData <- paste(txt[dataLines], collapse = " ") | |
# ***remove non-numeric characters***; some have positive signs | |
newData <- gsub("[^0-9 :.]", "", newData) | |
# now split into a vector | |
newData <- as.numeric(strsplit(newData, " ")[[1]]) | |
if (length(newData) < tmpElements) { | |
missingPoints <- tmpElements - length(newData) | |
newData <- c(newData, rep(NA, times = missingPoints)) | |
} | |
if (i != 1) { | |
dat[, i] <- newData | |
names(dat)[i] <- mnths[i] | |
} else if (i == 1) { | |
dat <- data.frame(colName = newData) | |
names(dat)[i] <- mnths[i] | |
} | |
} | |
### add parameters as rownames | |
### "mean dew point" (tmpElements[9]) is last simple 3-word element | |
for (i in 1:9) { | |
tmpName <- combineWords(elements) | |
if (sum(grep("OCCURRENCE", tmpName[i])) > 0) { | |
appendItem <- substr(rownames(dat)[i - 1], nchar(rownames(dat)[i - 1]) - 6, nchar(rownames(dat)[i - 1]) - 4) | |
rownames(dat)[i] <- paste0(tmpName[i], "_", appendItem) | |
} else { | |
rownames(dat)[i] <- tmpName[i] | |
} | |
} | |
dat <- t(dat) | |
temperatureData <- dat | |
names(temperatureData) <- paste0("TMP_", names(temperatureData)) | |
########## | |
########## Get heating/cooling degree days (rel. to 65 degrees F) | |
########## | |
hcCol <- txt[grep("HEATING", txt)[1]] | |
hcLabels <- combineWords(hcCol, terms = 2) | |
sub <- txt[grep("HEATING", txt)[1]:(grep("HEATING", txt)[1]+ 38)] # not sure this "38" is generalizable | |
# identify terms by number of spaces = 1 | |
hcTerms <- paste(sub[(lapply(gregexpr(" ", sub), length) == 1) & (!sub %in% "")], collapse = " ") | |
pairedNos <- combineWords(hcTerms, terms = 12, n = 2, spacer = " ") # paired nos | |
for (i in 1:length(mnths)) { # data is in lines 52:96 | |
newData <- as.numeric(strsplit(pairedNos[i], " ")[[1]]) # single nos | |
if (i != 1) { | |
dat[, i] <- newData | |
names(dat)[i] <- mnths[i] | |
} else if (i == 1) { | |
dat <- data.frame(colName = newData) | |
names(dat)[i] <- mnths[i] | |
} | |
if (i == length(mnths)) { # add rownames if this is the final month | |
rownames(dat) <- hcLabels | |
} | |
} | |
degreeDays <- t(dat) | |
data_combd <- cbind(temperatureData, degreeDays) | |
########## | |
########## | |
########## | |
########## | |
########## Get precip data | |
########## | |
targ <- grep("PRECIPITATION", txt)[1] | |
namesPPT_pre <- txt[targ] | |
namesPPT_pre <- gsub("WATER EQUIVALENT: ", "PPT ", namesPPT_pre) | |
namesPPT_pre <- gsub("NUMBER OF DAYS WITH: ", "", namesPPT_pre) | |
namesPPT_pre <- gsub("PRECIPITATION", "DAYS OVER", namesPPT_pre) | |
pptLabels <- combineWords(namesPPT_pre, terms = 6) | |
sub <- txt[(targ + 13):(targ + 23)] # first data appears at targ+13; last at targ+23 | |
sub <- sub[nchar(sub) > 10] | |
# have to filter out first 12 temrs for third data line, and split last data line into single digit nos from 12-digit string | |
# data is oriented row-wise; lines 1:3 cover whole rows, line 4 covers three rows | |
for (i in 1:6) { # | |
if (i < 3) { | |
newData <- as.numeric(strsplit(sub[i], " ")[[1]])[1:12] | |
} else if (i == 3) { | |
newData <- strsplit(sub[i], " ")[[1]][1:12] | |
} else if (i == 4) { | |
newData <- as.numeric(strsplit(sub[4], " ")[[1]])[1:12] | |
} else if (i == 5) { | |
newData <- as.numeric(strsplit(sub[4], " ")[[1]])[13:24] | |
} else if (i == 6) { | |
newData <- strsplit(sub[4], " ")[[1]][length(strsplit(sub[4], " ")[[1]])] | |
newData <- as.numeric(substring(newData, seq(1, nchar(newData), 1), seq(1, nchar(newData), 1))) | |
} | |
if (i != 1) { | |
dat[, i] <- newData | |
names(dat)[i] <- pptLabels[i] | |
} else if (i == 1) { | |
dat <- data.frame(colName = newData) | |
names(dat)[i] <- pptLabels[i] | |
} | |
if (i == 6) { | |
rownames(dat) <- mnths | |
} | |
} | |
data_combd <- cbind(data_combd, dat) | |
data_combd$month <- mnths | |
data_combd$year <- year | |
data_combd$moYr <- paste0(data_combd$month, "-", year) | |
rownames(data_combd) <- 1:nrow(data_combd) | |
########## | |
########## | |
########## | |
# combine annual summaries | |
if (j != 1) { | |
finalData <- rbind(finalData, data_combd) | |
} else if (j == 1) { | |
finalData <- data_combd | |
} | |
} | |
finalData | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment