Skip to content

Instantly share code, notes, and snippets.

@troyhill
Created September 16, 2015 13:57
Show Gist options
  • Save troyhill/d730135b099bdd054c98 to your computer and use it in GitHub Desktop.
Save troyhill/d730135b099bdd054c98 to your computer and use it in GitHub Desktop.
Import annual NCDC local climatological data report pdfs to R
NCDC_combine <- function(pdfFiles, pdfToText = "C:\\Program Files\\xpdfbin-win-3.04\\bin64\\pdftotext.exe") {
# get pdftotxt from ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.04.zip
# get NCDC Local Climatological Data annual summary pdfs from https://www.ncdc.noaa.gov/IPS/lcd/lcd.html
combineWords <- function(input, terms = 9, n = 3, spacer = "_", spacerIn = " ") {
# function takes a vector of words with spaces (input), and returns some number of terms (terms),
# on the basis of combining sequences of n words
splitTerms <- strsplit(input, spacerIn)[[1]] # individual words
starts <- seq(from = 1, to = length(splitTerms), by = n) # start of new terms
for (i in 1:(length(starts))) {
tmpName <- paste(splitTerms[starts[i]:(starts[i] + n - 1)], collapse = spacer)
if (i != 1) {
output <- c(output, tmpName)
} else {
output <- tmpName
}
}
output
}
for (j in 1:length(pdfFiles)) {
dest <- pdfFiles[j]
# set path to pdftotxt.exe and convert pdf to text
exe <- pdfToText
system(paste("\"", exe, "\" \"", dest, "\"", sep = ""), wait = F)
# get txt-file name and open it
filetxt <- sub(".pdf", ".txt", dest)
suppressWarnings(txt <- readLines(filetxt)) # don't mind that there warning...
### basic info
year <- txt[[1]] # not sure this is generalizable
stn <- txt[[5]] # not sure this is generalizable
tempStartCol <- grep("MEAN DAILY", txt)[1]
monthCols <- grep("JAN FEB", txt)
tmpElements <- 9 # number of parameters per month - code fills with NAs AT THE TAIL if data is not present. Identifying non-tail data gaps will take more work.
mnths <- strsplit(as.character(txt[[monthCols[1]]]), " ")[[1]][1:12]
elements <- txt[[tempStartCol]] # most are three words. see https://stackoverflow.com/questions/26497583/split-a-string-every-5-characters for converting to a list of temperature data categories
# months are in clusters of three lines between 52 & 100 (ending with number of days with max > 90F)
for (i in 1:length(mnths)) { # data is in lines 52:96
dataStart <- tempStartCol + 2
startData <- (dataStart + 4*(i-1))
dataLines <- c(startData:(startData + 2)) # data start every fourth line beginning at 52
newData <- paste(txt[dataLines], collapse = " ")
# ***remove non-numeric characters***; some have positive signs
newData <- gsub("[^0-9 :.]", "", newData)
# now split into a vector
newData <- as.numeric(strsplit(newData, " ")[[1]])
if (length(newData) < tmpElements) {
missingPoints <- tmpElements - length(newData)
newData <- c(newData, rep(NA, times = missingPoints))
}
if (i != 1) {
dat[, i] <- newData
names(dat)[i] <- mnths[i]
} else if (i == 1) {
dat <- data.frame(colName = newData)
names(dat)[i] <- mnths[i]
}
}
### add parameters as rownames
### "mean dew point" (tmpElements[9]) is last simple 3-word element
for (i in 1:9) {
tmpName <- combineWords(elements)
if (sum(grep("OCCURRENCE", tmpName[i])) > 0) {
appendItem <- substr(rownames(dat)[i - 1], nchar(rownames(dat)[i - 1]) - 6, nchar(rownames(dat)[i - 1]) - 4)
rownames(dat)[i] <- paste0(tmpName[i], "_", appendItem)
} else {
rownames(dat)[i] <- tmpName[i]
}
}
dat <- t(dat)
temperatureData <- dat
names(temperatureData) <- paste0("TMP_", names(temperatureData))
##########
########## Get heating/cooling degree days (rel. to 65 degrees F)
##########
hcCol <- txt[grep("HEATING", txt)[1]]
hcLabels <- combineWords(hcCol, terms = 2)
sub <- txt[grep("HEATING", txt)[1]:(grep("HEATING", txt)[1]+ 38)] # not sure this "38" is generalizable
# identify terms by number of spaces = 1
hcTerms <- paste(sub[(lapply(gregexpr(" ", sub), length) == 1) & (!sub %in% "")], collapse = " ")
pairedNos <- combineWords(hcTerms, terms = 12, n = 2, spacer = " ") # paired nos
for (i in 1:length(mnths)) { # data is in lines 52:96
newData <- as.numeric(strsplit(pairedNos[i], " ")[[1]]) # single nos
if (i != 1) {
dat[, i] <- newData
names(dat)[i] <- mnths[i]
} else if (i == 1) {
dat <- data.frame(colName = newData)
names(dat)[i] <- mnths[i]
}
if (i == length(mnths)) { # add rownames if this is the final month
rownames(dat) <- hcLabels
}
}
degreeDays <- t(dat)
data_combd <- cbind(temperatureData, degreeDays)
##########
##########
##########
##########
########## Get precip data
##########
targ <- grep("PRECIPITATION", txt)[1]
namesPPT_pre <- txt[targ]
namesPPT_pre <- gsub("WATER EQUIVALENT: ", "PPT ", namesPPT_pre)
namesPPT_pre <- gsub("NUMBER OF DAYS WITH: ", "", namesPPT_pre)
namesPPT_pre <- gsub("PRECIPITATION", "DAYS OVER", namesPPT_pre)
pptLabels <- combineWords(namesPPT_pre, terms = 6)
sub <- txt[(targ + 13):(targ + 23)] # first data appears at targ+13; last at targ+23
sub <- sub[nchar(sub) > 10]
# have to filter out first 12 temrs for third data line, and split last data line into single digit nos from 12-digit string
# data is oriented row-wise; lines 1:3 cover whole rows, line 4 covers three rows
for (i in 1:6) { #
if (i < 3) {
newData <- as.numeric(strsplit(sub[i], " ")[[1]])[1:12]
} else if (i == 3) {
newData <- strsplit(sub[i], " ")[[1]][1:12]
} else if (i == 4) {
newData <- as.numeric(strsplit(sub[4], " ")[[1]])[1:12]
} else if (i == 5) {
newData <- as.numeric(strsplit(sub[4], " ")[[1]])[13:24]
} else if (i == 6) {
newData <- strsplit(sub[4], " ")[[1]][length(strsplit(sub[4], " ")[[1]])]
newData <- as.numeric(substring(newData, seq(1, nchar(newData), 1), seq(1, nchar(newData), 1)))
}
if (i != 1) {
dat[, i] <- newData
names(dat)[i] <- pptLabels[i]
} else if (i == 1) {
dat <- data.frame(colName = newData)
names(dat)[i] <- pptLabels[i]
}
if (i == 6) {
rownames(dat) <- mnths
}
}
data_combd <- cbind(data_combd, dat)
data_combd$month <- mnths
data_combd$year <- year
data_combd$moYr <- paste0(data_combd$month, "-", year)
rownames(data_combd) <- 1:nrow(data_combd)
##########
##########
##########
# combine annual summaries
if (j != 1) {
finalData <- rbind(finalData, data_combd)
} else if (j == 1) {
finalData <- data_combd
}
}
finalData
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment