Last active
March 22, 2023 16:10
-
-
Save priscian/b322ea37e0678882db5a232a5b9c8bc5 to your computer and use it in GitHub Desktop.
Find the Differences between USHCN Raw & Homogenized Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Find the Differences between USHCN Raw & Homogenized Data. | |
### Inspired by: https://moyhu.blogspot.com/2014/05/nonsense-plots-of-ushcn-adjustments.html | |
rm(list = ls(all.names = TRUE)) | |
library(pacman) # install.packages("pacman") | |
#library(climeseries) # devtools::install_github("priscian/climeseries") | |
jjmisc::reload_all("climeseries", redocument = FALSE) | |
p_load(R.utils, matrixStats, tictoc) | |
tic("USHCN raw v. final") | |
dataDir <- getOption("climeseries_data_dir") # Put your favorite data directory here! | |
archiveNames <- c("ushcn.tavg.latest.raw.tar.gz", "ushcn.tavg.latest.FLs.52j.tar.gz"); temp_type <- "Average" # Average data | |
#archiveNames <- c("ushcn.tmax.latest.raw.tar.gz", "ushcn.tmax.latest.FLs.52j.tar.gz"); temp_type <- "Maximum" # Average data | |
## ftp://ftp.ncdc.noaa.gov/pub/data/ushcn/v2.5/readme.txt | |
archivePaths <- paste("ftp://ftp.ncdc.noaa.gov/pub/data/ushcn/v2.5", archiveNames, sep = "/") | |
fileList <- sapply(archivePaths, | |
function(x, download) | |
{ | |
archiveName <- paste(dataDir, basename(x), sep = "/") | |
if (download) { | |
download.file(x, archiveName, mode = "wb", quiet = TRUE) | |
untar(archiveName, compressed = TRUE, exdir = dataDir, list = FALSE) # extras = "--overwrite" not supported | |
} | |
untar(archiveName, compressed = TRUE, list = TRUE) | |
}, download = TRUE, simplify = FALSE) # 'download = FALSE' to use existing archives. | |
temp <- sapply(fileList, function(x) paste(dataDir, x, sep = "/"), simplify = FALSE) | |
## https://stackoverflow.com/questions/46147901/r-test-if-a-file-exists-and-is-not-a-directory | |
fl <- sapply(temp, function(a) a[Vectorize(file_test, vectorize.args = "x")("-f", a)], simplify = FALSE) | |
surl <- "ftp://ftp.ncdc.noaa.gov/pub/data/ushcn/v2.5/ushcn-v2.5-stations.txt" | |
stations <- read.fwf(surl, widths = c(11, 9, 10, 7, 3, 32, 7, 6, 7, 3), comment.char = "", stringsAsFactors = FALSE) | |
## Make list of raw/final data frames for each station in same wide format as original text file. | |
d <- sapply(fl, | |
function(x) | |
{ | |
sapply(x, | |
function(y) | |
{ | |
z <- read.fwf(y, c(11, 1, 4, rep(9, 12)), stringsAsFactors = FALSE); z <- z[, -2] # Some archives have an extra no. after the year! | |
flit <- (gsub("\\s+\\d+", "", trimws(sapply(z[, -(1:2)], function(a) gsub("([A-Za-z]\\d*)", "", a))))) | |
is.na(flit) <- flit == -9999 | |
#r <- dataframe(z[, 1:2], fahr_to_celsius(data.matrix(dataframe(flit)) / 100.0)) | |
## N.B. Conversion is unnecessary; temps are already in °C. V. the README file. | |
r <- dataframe(z[, 1:2], data.matrix(dataframe(flit)) / 100.0) | |
flit <- stations[stations$V1 == unique(r[[1]])[1], ] | |
attr(r, "location") <- list(lat = flit$V2, long = flit$V3, place = paste(trimws(flit$V6), trimws(flit$V5), sep = ", ")) | |
r | |
}, simplify = FALSE) | |
}, simplify = FALSE) | |
## Make list of raw/final data tables for each station in long format (year, month, temperature). | |
e <- sapply(d, | |
function(x) | |
{ | |
sapply(x, | |
function(y) | |
{ | |
siteId <- unique(y$V1) | |
z <- reshape2::melt(y[, 2L:14L], id.vars = "V3", variable.name = "month", value.name = siteId) | |
z$month <- as.numeric(z$month) | |
names(z)[names(z) == "V3"] <- "year" | |
z <- tbl_dt(dplyr::arrange(z, year, month)) | |
attr(z[[siteId]], "location") <- attr(y, "location") | |
z | |
}, simplify = FALSE) | |
}, simplify = FALSE) | |
## Make list of raw/final wide data tables of all long individual stations by merging them on month & year. | |
g <- sapply(e, | |
function(a) | |
{ | |
#Reduce(merge_fun_factory(all = TRUE, by = c("year", "month")), a) | |
Reduce(function(x, y) { dplyr::full_join(x, y, by = c("year", "month")) }, a) | |
}, simplify = FALSE) | |
## Calculate anomalies from some baseline for all stations. | |
baseline <- 1951:1980 | |
#baseline <- 1981:2010 | |
h <- sapply(g, function(a) recenter_anomalies(a, baseline = baseline), simplify = FALSE) | |
## Create raw/final list of regional/planetary grids of latitude-weighted cells & populate them with station data. | |
o <- sapply(h, | |
function(x) | |
{ | |
## Create global grid of 2.5° × 3.5° squares and bin temp values in the correct square. | |
p <- make_planetary_grid(grid_size = c(2.5, 3.5)) | |
y <- x[, get_climate_series_names(x), with = FALSE] | |
dev_null <- sapply(seq(NCOL(y)), | |
function(z) | |
{ | |
#cat(z, fill = TRUE) | |
elms <- y[, z, with = FALSE] | |
coords <- attr(elms[[1]], "location") | |
lat <- coords[["lat"]]; long <- coords[["long"]] | |
rc <- find_planetary_grid_square(p, lat, long) | |
if (any(is.na(rc))) return () | |
sq <- p[[rc["row"], rc["col"]]][[1]] | |
if (!is.data.frame(sq)) | |
p[[rc["row"], rc["col"]]][[1]] <<- elms | |
else | |
p[[rc["row"], rc["col"]]][[1]] <<- cbind(sq, elms) | |
nop() | |
}, simplify = FALSE) | |
## Check grid-cell contents before averaging: | |
## p[sapply(p, function(x) is.data.frame(x[[1]]))] # Which grid cells are populated? | |
## as.data.frame(p[<element_number>][[1]][[1]]) # To look at the data for cell <element_number> | |
## Create weighted average for each month for each grid cell. | |
dev_null <- sapply(seq(length(p)), | |
function(z) | |
{ | |
pDT <- p[z][[1]][[1]] | |
if (is.data.frame(pDT)) { | |
## https://stackoverflow.com/questions/31258547/data-table-row-wise-sum-mean-min-max-like-dplyr | |
p[z][[1]][[1]] <<- pDT[, `:=`(mean = rowMeans(.SD, na.rm = TRUE))][, .(mean)] | |
} | |
nop() | |
}, simplify = FALSE) | |
weights <- NULL | |
r <- data.matrix(Reduce(cbind, sapply(seq(length(p)), | |
function(z) | |
{ | |
if (!is.data.frame(p[z][[1]][[1]])) return (NULL) | |
weights <<- c(weights, attr(p[z][[1]], "weight")) | |
p[z][[1]][[1]][[1]] | |
}, simplify = FALSE))) | |
rr <- plyr::aaply(r, 1, stats::weighted.mean, w = weights, na.rm = TRUE) | |
is.na(rr) <- is.nan(rr) | |
x[, 1:2][, temp := rr] | |
}, simplify = FALSE) | |
## Create raw/final list of long data frames of average temps for the entire region. | |
r <- sapply(o, | |
function(x) { | |
yearRange <- range(x$year) | |
r <- base::merge(expand.grid(month = 1:12, year = seq(yearRange[1], yearRange[2])), x, by = c("year", "month"), all = TRUE) | |
#r$yr_part <- r$year + (r$month - 0.5)/12; r$met_year <- NA | |
as.data.frame(r) | |
}, simplify = FALSE) | |
series <- c("USHCN " %_% temp_type %_% " Raw", "USHCN " %_% temp_type %_% " Final") | |
for (i in seq(length(r))) { | |
names(r[[i]])[names(r[[i]]) == "temp"] <- series[i] | |
} | |
## Finally, merge raw/final data frames on year & month to produce a single data frame of the regional raw/final temps. | |
s <- Reduce(merge_fun_factory(all = TRUE, by = c("year", "month")), r) | |
s <- recenter_anomalies(s, baseline) | |
diffSeries <- "USHCN Final minus Raw" | |
s[[diffSeries]] <- s[[series[2]]] - s[[series[1]]] | |
s$yr_part <- s$year + (s$month - 0.5)/12; s$met_year <- NA | |
## How does this compare to the NCEI US temps? | |
u <- get_climate_data(download = FALSE, baseline = FALSE) | |
#u <- merge(u, s[, c("year", "month", get_climate_series_names(s))], all = TRUE) # Or: | |
u <- Reduce(merge_fun_factory(by = c("year", "month"), all = TRUE), list(u, s[, c("year", "month", get_climate_series_names(s))])) | |
compSeries <- c("NCEI US Avg. Temp.", "USHCN Average Final") | |
#compSeries <- c("NCEI US Max. Temp.", "USHCN Maximum Final") | |
toc() | |
# save.image("C:/common/data/climate/climeseries/ushcn-ave-raw-vs-final_20190206.RData") | |
# save.image("C:/common/data/climate/climeseries/ushcn-max-raw-vs-final_20190206.RData") | |
plot_climate_data(s, c(series, diffSeries), 1890, yearly = TRUE, col = c("blue", "orangered", "black"), save_png = FALSE) | |
plot_climate_data(s, c(series, diffSeries), 1890, yearly = TRUE, col = c("blue", "orangered", "black"), baseline = 1986:2015, save_png = FALSE) | |
plot_climate_data(s, series[2], 1890, yearly = TRUE, col = "blue", loess = TRUE, loess... = list(span = 0.4), save_png = FALSE) | |
## Plot 5-year MA | |
plot_climate_data(s, c(series), 1890, ma = 60, yearly = FALSE, col = c("blue", "orangered", "black"), baseline = 1986:2015, save_png = FALSE) | |
plot_climate_data(u, compSeries, 1895, yearly = TRUE, baseline = baseline, col = c("blue", "red"), save_png = FALSE) | |
## Plot just the diff. | |
plot_climate_data(u, diffSeries, 1895, yearly = TRUE, baseline = baseline, col = c("blue", "red"), ylim = c(-1, 1), save_png = FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment