Created
June 17, 2014 17:12
-
-
Save goldingn/ce9dc11a42539341858e to your computer and use it in GitHub Desktop.
Rough sketch of ZOON modular structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# _____________ ____________ ____________ ___________________ | |
# \ ______ \__ _\ ______ \ _\ ______ \ _\ ___ \______ | |
# \ / \ \ \ \ \ \__ _\ \ \ \ \ _\ \ \/ \ \_____ | |
# \ / \ \ \ \ \ \ \ \ \ \ \ \__ _\ \ \ \ \ \____ | |
# \ /______ \ \ \ \_____\ \ \ \ \ \_____\ \ \ \ \ \ \ \ \___ | |
# \___________\__\_\___________\__\_\_\___________\__\_\_\_\___________\__ | |
# \____________\___\_\____________\___\_\_\____________\__ | |
# \____________\_____\_\____________\___ | |
# SPECIES DISTRIBUTION MODELLING \____________\____ | |
# | |
# A PROTOTYPE FOR REPRODUCIBLE, ACCESSIBLE & SHAREABLE SCIENTIFIC OUTPUTS IN R | |
# (Greg, I edited your lovely ASCII art to bring it under the 80 character | |
# line limit for the Google R style guide) | |
# This is a *really* rough sketch of a modular structure for the ZOON workflow. | |
# In this version, a workflow consists of 5 types of modules, representing the | |
# key steps in an SDM workflow: | |
# | |
# occurrence module | |
# covariate module | |
# process module | |
# model module | |
# map module | |
# | |
# The user must select one module for each of these module types. These are | |
# then passed to the workflow wrapper function which runs through each step. | |
# Every module type should have a protocol determining what it takes as input | |
# and what it gives as output. Different modules for each module type could | |
# be stored in a community repositories, enabling users to quickly drop in a | |
# newly developed model or processing step and run it with their data. The | |
# structure also makes it very easy to compare different models against a | |
# standard dataset. | |
# These modules are just functions and wherever possible they should be self- | |
# contained, taking no arguments other than those used in the workflow. | |
# | |
# Clearly this is very simplistic, obvious next steps include modules for | |
# validation statistics and plots, the ability to predict to different | |
# covariate rasters than are used to train the model, etc. | |
# | |
# OK, let's go.... | |
# ~~~~~~~~~~~~ | |
# clear the workspace | |
rm(list = ls()) | |
# ~~~~~~~~~~~~ | |
# define the modules fir each module type | |
# ~~~~~~~~~~~~ | |
# occurrence modules | |
# input: a numeric vetor of length 4 giving the coordinates of the rectangular | |
# region within which to carry out the analysis, in the order: xmin, xmax, | |
# ymin, ymax. | |
# output: a dataframe with four columns: | |
# value - a numeric value which may give 1 for presences, 0 for absences or a | |
# positive integer for count data | |
# type - a character value saying what is in the value column | |
# lon - the longitude of the record | |
# lat - the latitutude of the record | |
# ~~~ | |
# occurrenceCp: | |
# occurrence module to grab *Culex pipiens* (a mosquito) occurrence (i.e. | |
# presence-only) data from GBIF, in the area bounded by extent. | |
# Perhaps this should have temporal interval too for future-proofing? | |
occurrenceCp <- function (extent) { | |
require (dismo) | |
raw <- gbif(genus = 'Culex', | |
species = 'pipiens', | |
ext = extent) | |
occurrence <- raw[, c('lon', 'lat')] | |
occurrence$value <- 1 | |
occurrence$type <- 'presence' | |
return(occurrence) | |
} | |
# ~~~~~~~~~~~~ | |
# covariate modules | |
# input: a numeric vetor of length 4 giving the coordinates of the rectangular | |
# region within which to carry out the analysis, in the order: xmin, xmax, | |
# ymin, ymax. | |
# output: a Raster* object (class from the raster package) with the gridded | |
# covariates used to train and predict from the SDM. | |
# ~~~ | |
# covariateAir: | |
# covariate module to grab a coarse resolution mean air temperature raster from | |
# January-February 2001-2002 for the given extent. | |
covariateAir <- function (extent) { | |
require(RNCEP) | |
c1 <- NCEP.gather(variable = 'air', | |
level = 850, | |
months.minmax = c(1:2), | |
years.minmax = c(2000,2001), | |
lat.southnorth = extent[3:4], | |
lon.westeast = extent[1:2], | |
reanalysis2 = FALSE, | |
return.units = TRUE) | |
avg <- apply(c1, c(1, 2), mean) | |
ras <- raster(avg) | |
extent(ras) <- c(extent) | |
return (ras) | |
} | |
# ~~~~~~~~~~~~ | |
# process modules | |
# input: | |
# occ - occurrence data, the output from an occurrence module | |
# ras - covariate data, the output from a covariate module | |
# output: dataframe with at least 5 columns | |
# value - a numeric value which may give 1 for presences, 0 for absences or a | |
# positive integer for count data | |
# type - a character value saying what is in the value column | |
# lon - the longitude of the record | |
# lat - the latitutude of the record | |
# columns 5-n - the values of the covariates for each records (the names of | |
# these columns should correspond exactly to the names of the layers in | |
# ras) | |
# ~~~ | |
# processA: | |
# process module to generate up to 100 background records at random in | |
# cells of ras and return these along with the ppresence only data. | |
processA <- function (occ, ras) { | |
require (dismo) | |
if (!all(occ$type == 'presence')) { | |
stop ('this function only works for presence-only data') | |
} | |
# generate pseudo-absence data | |
pa <- randomPoints(ras, | |
100) | |
npres <- nrow(occ) | |
npabs <- nrow(pa) | |
# extract covariates | |
occ_covs <- as.matrix(extract(ras, occ[, c('lon', 'lat')])) | |
pa_covs <- as.matrix(extract(ras, pa)) | |
covs <- rbind(occ_covs, | |
pa_covs) | |
# combine with the occurrence data | |
df <- data.frame(value = rep(c(1, 0), | |
c(npres, npabs)), | |
type = rep(c('presence', 'background'), | |
c(npres, npabs)), | |
lon = c(occ$lon, pa[, 1]), | |
lat = c(occ$lat, pa[, 2]), | |
covs) | |
names(df)[5:ncol(df)] <- names(ras) | |
return(df) | |
} | |
# ~~~~~~~~~~~~ | |
# model modules | |
# input: | |
# df - a dataframe, the output from a process module | |
# output: a model object with a valid predict method | |
# note that the current set up only works for models with a predict method | |
# which takes the argument: type = 'response' | |
# obviously we'll have to work around that. | |
# ~~~ | |
# modelLR: | |
# model module to fit a simple logistic regression model | |
modelLR <- function (df) { | |
if (!all(df$type %in% c('presence', 'absence', 'background'))) { | |
stop ('only for presence/absence or presence/background data') | |
} | |
covs <- as.data.frame(df[, 5:ncol(df)]) | |
names(covs) <- names(df)[5:ncol(df)] | |
m <- glm(df$value ~ ., | |
data = covs, | |
family = binomial) | |
return (m) | |
} | |
# modelRF: | |
# model module to fit a simple RandomForest classification model | |
modelRF <- function (df) { | |
require ('randomForest') | |
if (!all(df$type %in% c('presence', 'absence', 'background'))) { | |
stop ('only for presence/absence or presence/background data') | |
} | |
covs <- as.data.frame(df[, 5:ncol(df)]) | |
names(covs) <- names(df)[5:ncol(df)] | |
m <- randomForest(df$value ~ ., | |
data = covs, | |
weights = rep(1, nrow(covs)), | |
size = 1) | |
return (m) | |
} | |
# ~~~~~~~~~~~~ | |
# map modules | |
# input: | |
# model - a model object, the output from a model module | |
# ras - a Raster* object, the output from a covariate module | |
# output: a Raster object giving the probabilistic model predictions for each | |
# cell of ras | |
mapA <- function (model, ras) { | |
vals <- data.frame(getValues(ras)) | |
colnames(vals) <- names(ras) | |
pred <- predict(model, | |
newdata = vals, | |
type = 'response') | |
pred_ras <- ras[[1]] | |
pred_ras <- setValues(pred_ras, pred) | |
return(pred_ras) | |
} | |
# ~~~~~~~~~~~~ | |
# workflow wrapper function | |
# input: an extent and one of each of the modules | |
# output: a list with the extent, the results of each module and a copy of the | |
# code used to execute the workflow (what's there now should be source-able | |
# though I'm sure there is a much neater approach than the one I took - the | |
# ultimate aim would be a much nicer way of enhancing reproducibility). | |
workflow <- function(ext, | |
occurrenceFn, | |
covariateFn, | |
processFn, | |
modelFn, | |
mapFn) { | |
# small helper function to dump the code for a function to a text string | |
getSource <- function (object) { | |
paste(deparse(object), collapse = '\n') | |
} | |
occ <- occurrenceFn(ext) | |
cov <- covariateFn(ext) | |
df <- processFn(occ, cov) | |
m <- modelFn(df) | |
map <- mapFn(m, cov) | |
# get the command used to call this function | |
bits <- sys.call() | |
call <- paste0(bits[1], | |
'(', | |
paste(bits[-1], | |
collapse = ', '), | |
')') | |
# sorry about this spaghetti... | |
workflow <- paste0(paste(bits[-2], | |
c(getSource(workflow), | |
getSource(occurrenceFn), | |
getSource(covariateFn), | |
getSource(processFn), | |
getSource(modelFn), | |
getSource(mapFn)), | |
sep = ' <- ', | |
collapse = '\n\n'), | |
paste0('\n\n', | |
bits[2], | |
' <- c(', | |
paste(ext, collapse = ', '), | |
')\n'), | |
'\nans <- ', | |
call, | |
collapse = '\n\n\n') | |
return(list(extent = extent, | |
occ = occ, | |
cov = cov, | |
df = df, | |
m = m, | |
map = map, | |
workflow = workflow)) | |
} | |
# ~~~~~~~~~~~~~~ | |
# test drive it! | |
# define the extent in lat and long | |
UKextent <- c(xmin = -10, | |
xmax = 10, | |
ymin = 45, | |
ymax = 65) | |
# run a workflow, using the logistic regression model | |
ans1 <- workflow(ext = UKextent, | |
occurrenceFn = occurrenceCp, | |
covariateFn = covariateAir, | |
processFn = processA, | |
modelFn = modelLR, | |
mapFn = mapA) | |
# switch the model to a RandomForest | |
ans2 <- workflow(ext = UKextent, | |
occurrenceFn = occurrenceCp, | |
covariateFn = covariateAir, | |
processFn = processA, | |
modelFn = modelRF, | |
mapFn = mapA) | |
# look at the contents of these lists | |
str(ans1, 1) | |
str(ans2, 1) | |
# plot the resulting maps | |
par(mfrow = c(1, 2)) | |
plot(ans1$map, | |
zlim = c(0,1), | |
main = 'LR') | |
points(ans1$occ[, 1:2], | |
pch = 16, | |
cex = 0.3) | |
plot(ans2$map, | |
zlim = c(0,1), | |
main = 'RF') | |
points(ans2$occ[, 1:2], | |
pch = 16, | |
cex = 0.3) | |
# ~~~~~~~~~~~~~~~ | |
# test run the script it outputted | |
# save the workflow script somewhere | |
write(ans1$workflow, file = '~/Desktop/ans1_code.txt') | |
# clear everything in the workspace | |
rm(list = ls()) | |
# call the script! | |
source('~/Desktop/ans1_code.txt') | |
# plot the first map again | |
par(mfrow = c(1, 1)) | |
plot(ans$map, | |
zlim = c(0,1), | |
main = 'LR') | |
points(ans$occ[, 1:2], | |
pch = 16, | |
cex = 0.3) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment