troyhill · September 16, 2015 13:57
diff --git a/NCDC_pdfToR.R b/NCDC_pdfToR.R
 NCDC_combine <- function(pdfFiles, pdfToText = "C:\\Program Files\\xpdfbin-win-3.04\\bin64\\pdftotext.exe") {
  # get pdftotxt from ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.04.zip
  # get NCDC Local Climatological Data annual summary pdfs from https://www.ncdc.noaa.gov/IPS/lcd/lcd.html
  combineWords <- function(input, terms = 9, n = 3, spacer = "_", spacerIn = " ") {
    # function takes a vector of words with spaces (input), and returns some number of terms (terms),
    # on the basis of combining sequences of n words
    splitTerms <- strsplit(input, spacerIn)[[1]] # individual words
    starts <- seq(from = 1, to = length(splitTerms), by = n) # start of new terms
    for (i in 1:(length(starts))) {
      tmpName <- paste(splitTerms[starts[i]:(starts[i] + n - 1)], collapse = spacer)
      if (i != 1) {
        output <- c(output, tmpName)
      } else {
        output <- tmpName
      }
    }
    output
  }
  
  
  for (j in 1:length(pdfFiles)) {
    dest    <- pdfFiles[j]
    # set path to pdftotxt.exe and convert pdf to text
    exe     <- pdfToText
    system(paste("\"", exe, "\" \"", dest, "\"", sep = ""), wait = F)
    # get txt-file name and open it  
    filetxt <- sub(".pdf", ".txt", dest)
    
    suppressWarnings(txt <- readLines(filetxt)) # don't mind that there warning...
    
    ### basic info
    year         <- txt[[1]] # not sure this is generalizable
    stn          <- txt[[5]] # not sure this is generalizable
    tempStartCol <- grep("MEAN DAILY", txt)[1]
    monthCols    <- grep("JAN FEB", txt)
    tmpElements  <- 9 # number of parameters per month - code fills with NAs AT THE TAIL if data is not present. Identifying non-tail data gaps will take more work.
    
    mnths    <- strsplit(as.character(txt[[monthCols[1]]]), " ")[[1]][1:12]
    elements <- txt[[tempStartCol]] # most are three words. see https://stackoverflow.com/questions/26497583/split-a-string-every-5-characters for converting to a list of temperature data categories
    
    
    # months are in clusters of three lines between 52 & 100 (ending with number of days with max > 90F)
    
    for (i in 1:length(mnths)) { # data is in lines 52:96
      dataStart       <- tempStartCol + 2
      startData       <- (dataStart + 4*(i-1))
      dataLines       <- c(startData:(startData + 2)) # data start every fourth line beginning at 52
      newData         <- paste(txt[dataLines], collapse = " ")
      # ***remove non-numeric characters***; some have positive signs
      newData         <- gsub("[^0-9 :.]", "", newData)
      # now split into a vector
      newData         <- as.numeric(strsplit(newData, " ")[[1]])
      if (length(newData) < tmpElements) {
        missingPoints <- tmpElements - length(newData)
        newData       <- c(newData, rep(NA, times = missingPoints))
      }
      if (i != 1) {
        dat[, i]      <- newData
        names(dat)[i] <- mnths[i]
      } else if (i == 1) {
        dat           <- data.frame(colName = newData)
        names(dat)[i] <- mnths[i]
      }
    }
    
    ### add parameters as rownames
    ### "mean dew point" (tmpElements[9]) is last simple 3-word element
    for (i in 1:9) {
      tmpName            <- combineWords(elements)
      if (sum(grep("OCCURRENCE", tmpName[i])) > 0)  {
        appendItem       <- substr(rownames(dat)[i - 1], nchar(rownames(dat)[i - 1]) - 6, nchar(rownames(dat)[i - 1]) - 4)
        rownames(dat)[i] <- paste0(tmpName[i], "_", appendItem)
      } else {
        rownames(dat)[i] <- tmpName[i]
      }
    }
    dat             <- t(dat)
    temperatureData <- dat
    names(temperatureData) <- paste0("TMP_", names(temperatureData))
    
    ##########
    ########## Get heating/cooling degree days (rel. to 65 degrees F)
    ##########
    hcCol    <- txt[grep("HEATING", txt)[1]]
    hcLabels <- combineWords(hcCol, terms = 2)
    sub      <- txt[grep("HEATING", txt)[1]:(grep("HEATING", txt)[1]+ 38)] # not sure this "38" is generalizable
    
    # identify terms by number of spaces = 1
    hcTerms   <- paste(sub[(lapply(gregexpr(" ", sub), length) == 1) & (!sub %in% "")], collapse = " ")
    pairedNos <- combineWords(hcTerms, terms = 12, n = 2, spacer = " ") # paired nos
    
    for (i in 1:length(mnths)) { # data is in lines 52:96
      newData         <- as.numeric(strsplit(pairedNos[i], " ")[[1]]) # single nos
      if (i != 1) {
        dat[, i]      <- newData
        names(dat)[i] <- mnths[i]
      } else if (i == 1) {
        dat <- data.frame(colName = newData)
        names(dat)[i] <- mnths[i]
      }
      if (i == length(mnths)) { # add rownames if this is the final month
        rownames(dat) <- hcLabels
      }
    }
    degreeDays <- t(dat)
    data_combd <- cbind(temperatureData, degreeDays)
    ##########
    ##########
    ##########
    
    
    
    ##########
    ########## Get precip data
    ##########
    targ         <- grep("PRECIPITATION", txt)[1]
    namesPPT_pre <- txt[targ]
    namesPPT_pre <- gsub("WATER EQUIVALENT: ", "PPT ", namesPPT_pre)
    namesPPT_pre <- gsub("NUMBER OF DAYS WITH: ", "", namesPPT_pre)
    namesPPT_pre <- gsub("PRECIPITATION", "DAYS OVER", namesPPT_pre)
    pptLabels    <- combineWords(namesPPT_pre, terms = 6)
    
    sub <- txt[(targ + 13):(targ + 23)] # first data appears at targ+13; last at targ+23
    sub <- sub[nchar(sub) > 10]
    # have to filter out first 12 temrs for third data line, and split last data line into single digit nos from 12-digit string
    # data is oriented row-wise; lines 1:3 cover whole rows, line 4 covers three rows
    for (i in 1:6) { # 
      if (i < 3) {
        newData <- as.numeric(strsplit(sub[i], " ")[[1]])[1:12]
      } else if (i == 3) {
        newData <- strsplit(sub[i], " ")[[1]][1:12]
      } else if (i == 4) {
        newData <- as.numeric(strsplit(sub[4], " ")[[1]])[1:12]
      } else if (i == 5) {
        newData <- as.numeric(strsplit(sub[4], " ")[[1]])[13:24]
      } else if (i == 6) {
        newData <- strsplit(sub[4], " ")[[1]][length(strsplit(sub[4], " ")[[1]])]
        newData <- as.numeric(substring(newData, seq(1, nchar(newData), 1), seq(1, nchar(newData), 1)))
      }
      if (i != 1) {
        dat[, i] <- newData
        names(dat)[i] <- pptLabels[i]
      } else if (i == 1) {
        dat <- data.frame(colName = newData)
        names(dat)[i] <- pptLabels[i]
      }
      if (i == 6) {
        rownames(dat) <- mnths
      }
    }
    
    data_combd <- cbind(data_combd, dat)
    
    data_combd$month     <- mnths
    data_combd$year      <- year
    data_combd$moYr      <- paste0(data_combd$month, "-", year)
    rownames(data_combd) <- 1:nrow(data_combd)
    ##########
    ##########
    ##########
    # combine annual summaries
    if (j != 1) {
      finalData <- rbind(finalData, data_combd)
    } else if (j == 1) {
      finalData <- data_combd
    }
  }
  finalData
 }
	NCDC_combine <- function(pdfFiles, pdfToText = "C:\\Program Files\\xpdfbin-win-3.04\\bin64\\pdftotext.exe") {
	# get pdftotxt from ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.04.zip
	# get NCDC Local Climatological Data annual summary pdfs from https://www.ncdc.noaa.gov/IPS/lcd/lcd.html
	combineWords <- function(input, terms = 9, n = 3, spacer = "_", spacerIn = " ") {
	# function takes a vector of words with spaces (input), and returns some number of terms (terms),
	# on the basis of combining sequences of n words
	splitTerms <- strsplit(input, spacerIn)[[1]] # individual words
	starts <- seq(from = 1, to = length(splitTerms), by = n) # start of new terms
	for (i in 1:(length(starts))) {
	tmpName <- paste(splitTerms[starts[i]:(starts[i] + n - 1)], collapse = spacer)
	if (i != 1) {
	output <- c(output, tmpName)
	} else {
	output <- tmpName
	}
	}
	output
	}


	for (j in 1:length(pdfFiles)) {
	dest <- pdfFiles[j]
	# set path to pdftotxt.exe and convert pdf to text
	exe <- pdfToText
	system(paste("\"", exe, "\" \"", dest, "\"", sep = ""), wait = F)
	# get txt-file name and open it
	filetxt <- sub(".pdf", ".txt", dest)

	suppressWarnings(txt <- readLines(filetxt)) # don't mind that there warning...

	### basic info
	year <- txt[[1]] # not sure this is generalizable
	stn <- txt[[5]] # not sure this is generalizable
	tempStartCol <- grep("MEAN DAILY", txt)[1]
	monthCols <- grep("JAN FEB", txt)
	tmpElements <- 9 # number of parameters per month - code fills with NAs AT THE TAIL if data is not present. Identifying non-tail data gaps will take more work.

	mnths <- strsplit(as.character(txt[[monthCols[1]]]), " ")[[1]][1:12]
	elements <- txt[[tempStartCol]] # most are three words. see https://stackoverflow.com/questions/26497583/split-a-string-every-5-characters for converting to a list of temperature data categories


	# months are in clusters of three lines between 52 & 100 (ending with number of days with max > 90F)

	for (i in 1:length(mnths)) { # data is in lines 52:96
	dataStart <- tempStartCol + 2
	startData <- (dataStart + 4*(i-1))
	dataLines <- c(startData:(startData + 2)) # data start every fourth line beginning at 52
	newData <- paste(txt[dataLines], collapse = " ")
	# *remove non-numeric characters*; some have positive signs
	newData <- gsub("[^0-9 :.]", "", newData)
	# now split into a vector
	newData <- as.numeric(strsplit(newData, " ")[[1]])
	if (length(newData) < tmpElements) {
	missingPoints <- tmpElements - length(newData)
	newData <- c(newData, rep(NA, times = missingPoints))
	}
	if (i != 1) {
	dat[, i] <- newData
	names(dat)[i] <- mnths[i]
	} else if (i == 1) {
	dat <- data.frame(colName = newData)
	names(dat)[i] <- mnths[i]
	}
	}

	### add parameters as rownames
	### "mean dew point" (tmpElements[9]) is last simple 3-word element
	for (i in 1:9) {
	tmpName <- combineWords(elements)
	if (sum(grep("OCCURRENCE", tmpName[i])) > 0) {
	appendItem <- substr(rownames(dat)[i - 1], nchar(rownames(dat)[i - 1]) - 6, nchar(rownames(dat)[i - 1]) - 4)
	rownames(dat)[i] <- paste0(tmpName[i], "_", appendItem)
	} else {
	rownames(dat)[i] <- tmpName[i]
	}
	}
	dat <- t(dat)
	temperatureData <- dat
	names(temperatureData) <- paste0("TMP_", names(temperatureData))

	##########
	########## Get heating/cooling degree days (rel. to 65 degrees F)
	##########
	hcCol <- txt[grep("HEATING", txt)[1]]
	hcLabels <- combineWords(hcCol, terms = 2)
	sub <- txt[grep("HEATING", txt)[1]:(grep("HEATING", txt)[1]+ 38)] # not sure this "38" is generalizable

	# identify terms by number of spaces = 1
	hcTerms <- paste(sub[(lapply(gregexpr(" ", sub), length) == 1) & (!sub %in% "")], collapse = " ")
	pairedNos <- combineWords(hcTerms, terms = 12, n = 2, spacer = " ") # paired nos

	for (i in 1:length(mnths)) { # data is in lines 52:96
	newData <- as.numeric(strsplit(pairedNos[i], " ")[[1]]) # single nos
	if (i != 1) {
	dat[, i] <- newData
	names(dat)[i] <- mnths[i]
	} else if (i == 1) {
	dat <- data.frame(colName = newData)
	names(dat)[i] <- mnths[i]
	}
	if (i == length(mnths)) { # add rownames if this is the final month
	rownames(dat) <- hcLabels
	}
	}
	degreeDays <- t(dat)
	data_combd <- cbind(temperatureData, degreeDays)
	##########
	##########
	##########



	##########
	########## Get precip data
	##########
	targ <- grep("PRECIPITATION", txt)[1]
	namesPPT_pre <- txt[targ]
	namesPPT_pre <- gsub("WATER EQUIVALENT: ", "PPT ", namesPPT_pre)
	namesPPT_pre <- gsub("NUMBER OF DAYS WITH: ", "", namesPPT_pre)
	namesPPT_pre <- gsub("PRECIPITATION", "DAYS OVER", namesPPT_pre)
	pptLabels <- combineWords(namesPPT_pre, terms = 6)

	sub <- txt[(targ + 13):(targ + 23)] # first data appears at targ+13; last at targ+23
	sub <- sub[nchar(sub) > 10]
	# have to filter out first 12 temrs for third data line, and split last data line into single digit nos from 12-digit string
	# data is oriented row-wise; lines 1:3 cover whole rows, line 4 covers three rows
	for (i in 1:6) { #
	if (i < 3) {
	newData <- as.numeric(strsplit(sub[i], " ")[[1]])[1:12]
	} else if (i == 3) {
	newData <- strsplit(sub[i], " ")[[1]][1:12]
	} else if (i == 4) {
	newData <- as.numeric(strsplit(sub[4], " ")[[1]])[1:12]
	} else if (i == 5) {
	newData <- as.numeric(strsplit(sub[4], " ")[[1]])[13:24]
	} else if (i == 6) {
	newData <- strsplit(sub[4], " ")[[1]][length(strsplit(sub[4], " ")[[1]])]
	newData <- as.numeric(substring(newData, seq(1, nchar(newData), 1), seq(1, nchar(newData), 1)))
	}
	if (i != 1) {
	dat[, i] <- newData
	names(dat)[i] <- pptLabels[i]
	} else if (i == 1) {
	dat <- data.frame(colName = newData)
	names(dat)[i] <- pptLabels[i]
	}
	if (i == 6) {
	rownames(dat) <- mnths
	}
	}

	data_combd <- cbind(data_combd, dat)

	data_combd$month <- mnths
	data_combd$year <- year
	data_combd$moYr <- paste0(data_combd$month, "-", year)
	rownames(data_combd) <- 1:nrow(data_combd)
	##########
	##########
	##########
	# combine annual summaries
	if (j != 1) {
	finalData <- rbind(finalData, data_combd)
	} else if (j == 1) {
	finalData <- data_combd
	}
	}
	finalData
	}