-
-
Save gavinsimpson/6d05af9186b9f9419cca5a4507af3aa0 to your computer and use it in GitHub Desktop.
## Process precip data for Nuuk | |
## Packages | |
library("tibble") | |
library("readr") | |
library("tidyr") | |
library("curl") | |
## Data - data are at ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/v2.prcp.Z | |
## file is compressed | |
url <- "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/v2.prcp.Z" | |
tmp <- "./v2.prcp.Z" | |
curl_download(url, tmp, quiet = FALSE) | |
## At this point you'll need to uncompress the x-compress file you just downloaded, | |
## and extract the `v2.prcp` file to the working (or other) directory. | |
## There is an R package `uncompress` that could do this for you, but it was removed | |
## from CRAN at some point; the last version archived there is from 2012. | |
path.2.v2.prcp <- "./v2.prcp" | |
## data have 16 chars, then twelve groups of 5 chars | |
## the first 16 chars contain relevant info and need extracting into separate variables on reading | |
precip <- read_fwf(path.2.v2.prcp, | |
fwf_widths(c(3, 5, 3, 1, 4, rep(5, 12)), | |
col_names = c("Country","WMOID","Modifier","Duplicate","Year", month.abb)), | |
na = c("-9999","-8888")) # -8888 means a trace, which I ignore (naughty!) | |
# -9999 is straight up missing | |
## subset only the bits we want | |
precip <- precip[precip$WMOID == 4250L, ] # 4250 is the WMO station ID for Nuuk (Godthaab) | |
## gather the month cols into a single key value pair | |
precip <- gather(precip, Month, Precip, -Country, -WMOID, -Modifier, -Duplicate, -Year) | |
## add a numeric column for month | |
months <- 1:12 | |
names(months) <- month.abb | |
precip <- add_column(precip, nMonth = unname(months[precip$Month]), .after = "Month") | |
## Drop some other stuff | |
precip <- precip[, c("WMOID", "Year", "Month", "nMonth", "Precip")] | |
## Save this to RDS | |
write_rds(precip, "./nuuk-precip-data.rds") |
## Load and analyse precip data for Nuuk | |
## Packages | |
library("mgcv") | |
library("ggplot2") | |
library("viridis") | |
theme_set(theme_bw()) | |
## Data - data are in ./nuuk-precip-data.rds | |
## This file can be created by running ./data-prep.R *interactively* | |
precip <- readRDS("./nuuk-precip-data.rds") | |
## gam.control object | |
ctrl <- gam.control() | |
## ctrl <- gam.control(nthreads = 4) # uncomment if you can use multiple threads | |
## This coerces precip to a data.frame - no tibbles from now on | |
precip <- transform(precip, Precip = Precip / 10) # Precip data are in 1/10 of a mm | |
## model will be a Tweedie GAM | |
## te() version as interaction required | |
m <- gam(Precip ~ te(nMonth, Year, bs = c("cc", "tp"), k = c(12, 35)), | |
data = precip, knots = list(nMonth = c(0.5, 12.5)), | |
family = tw, method = "REML", select = TRUE, control = ctrl) | |
summary(m) | |
gam.check(m) | |
plot(m, scheme = 2, n = 200, tran = family(m)$linkinv) | |
## Look at the fitted model | |
ilink <- family(m)$linkinv # this is the inverse of the link function, as an R function | |
## Generate new data to predict at; to get smooth plots use ~ 100 evenly spaced values for each covariate | |
pdata <- with(precip, expand.grid(Year = seq(min(Year), max(Year), length.out = 100), | |
nMonth = seq(1, 12, length.out = 100))) | |
## Add on the fitted values and SEs *on the log [link] scale* | |
pdata <- cbind(pdata, as.data.frame(predict(m, newdata = pdata, type = "link", se.fit = TRUE))) | |
## Compute 95% confidence interval on link scale & back transform this & model fit to response scale | |
pdata <- transform(pdata, | |
Upper = ilink(fit + (1.96 * se.fit)), | |
Lower = ilink(fit - (1.96 * se.fit)), | |
Fitted = ilink(fit)) | |
## plot | |
p1 <- ggplot(pdata, aes(x = nMonth, y = Fitted, colour = Year, group = factor(Year))) + | |
geom_line() + | |
scale_color_viridis(option = "plasma", end = 0.95) + | |
labs(title = "Modelled change in monthly total precipitation at Nuuk, Greenland, 1874–2016", | |
subtitle = "Increase in rainfall & marked change in within-year distribution", | |
caption = "Data: www.ncdc.noaa.gov/ghcnm/v2.php", | |
x = NULL, | |
y = "Estimated total precipitation (mm)") + | |
scale_x_continuous(breaks = 1:12, minor_breaks = NULL, labels = month.abb) | |
p1 | |
ggsave("modelled-nuuk-rainfall.png", p1, dpi = 100, width = 8.5, height = 5.5) |
Hi Graham,
The gamma is a special case of the family of Tweedie distributions (when p = 2). The advantage of using tw()
is that we can have 0 years (but probably not relevant in most places) but also it allows for different mean variance relationships, where the variance doesn't increase as quickly with the mean as it does in the gamma, and tw()
allows you to estimate the p parameter, so you can account for a whole range of mean variance relationships between Poisson-like and the gamma using tw()
.
Awesome. Thank you very much for the explanation!
Hello, I cannot download the data, does it still exist?
@AliciaHamer The data appear to be here ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v2/ with different naming conventions but perhaps a nicer compression system such that we can open them directly from within R. If you try the new source and it doesn't work (i.e. the internal file structure has changed), let me know and I'll take a look at updating this to provide working code.
Hi Gavin,
Very helpful post. Just curious, why the selection of tweedie distribution? My understanding is this distribution is good for positive data, where potentially lots of zeros - as there may be when precipitation data is broken down beyond annual values? I always like to get a sense of climate trends in my study regions, just to see what has gone on over the past ~80 - 100 yrs, but these data are usually analyzed on an annual time scale. In which case, a Gamma distribution could be reasonable?
Cheers