mdsumner · August 1, 2017 00:17
diff --git a/efficient-raster-extract.R b/efficient-raster-extract.R
 ## source data: a 40Gb NetCDF on a regular longlat/WGS84 grid, dimensions
 ## 681, 841, 20587 - i.e. 20k layers of a modelled climate metric
 filename <- "/path/to/hideous/monstrosity40Gb.nc"
 tbrick <- raster::brick(
  filename,
  quick = TRUE,
  varname = "monster1"
 )

 ## the query: 80 polygons that together cover 18k cells of the [681,841] grid
 shp_spatial <- as(shp_tab, "Spatial")

 ## cell / polygon lookup, computed once up front
 ## (needs devtools::install_github("hypertidy/tabularaster"))
 ## this leans entirely on the grid being regular: raster::cellFrom* is affine-only
 cell <- tabularaster::cellnumbers(tbrick, shp_spatial)
 ## guesstimate at 3 hours for traditional loop over stack layers with cell number extract
 #for (i in seq_len(nlayers(tbrick))) x[[i]] <- raster::extract(tbrick[[i]], cell$cell_)

 library(future)
 ## 'multisession' stands in for the 'multiprocess' strategy, which current
 ## releases of the future package have deprecated; futures are resolved in
 ## background R sessions
 plan(multisession)
 ## worker handed to the parallel apply (scheduling is left to the future plan)
 ## ilayer: index of a single layer in tbrick
 ## returns whatever raster::extract() gives for that layer at the stored cells
 fun1 <- function(ilayer) {
  ## the efficiency comes from the cell numbers being pre-calculated: only
  ## those cells are read, and `cell` carries the polygon grouping-ID
  layer <- tbrick[[ilayer]]
  raster::extract(layer, cell$cell_)
 }

 ## time the full extraction: fun1 is applied to every layer index and the
 ## per-layer results are collected in the list x
 ## NOTE(review): newer future releases ship future_lapply() in the separate
 ## future.apply package - confirm against the installed version
 system.time({
  ## nlayers() is namespace-qualified like the other raster calls here; no
  ## library(raster) call is visible in this script, so the bare name may not resolve
  x <- future_lapply(seq_len(raster::nlayers(tbrick)), fun1)
 })
 ## 12 cores
 #user    system   elapsed 
 #11839.320   808.560  1220.962 

 ## YMMV - much depends on the grid dimensions, and its layout on disk - single-cell all-time
 ## extraction is prohibitively slow in this particular case, see ?raster::writeRaster for its 
 ## terminology here in raw files (BSQ, BIL, BIP) - package ff handles these with ease, but
 ## converting 40Gb of climate model output is rarely a practical workflow to optimize extraction
 ## - i.e. the extraction scheme really matters
	## source data: a 40Gb NetCDF on a regular longlat/WGS84 grid, dimensions
	## 681, 841, 20587 - i.e. 20k layers of a modelled climate metric
	filename <- "/path/to/hideous/monstrosity40Gb.nc"
	tbrick <- raster::brick(
		filename,
		quick = TRUE,
		varname = "monster1"
	)

	## the query: 80 polygons that together cover 18k cells of the [681,841] grid
	shp_spatial <- as(shp_tab, "Spatial")

	## cell / polygon lookup, computed once up front
	## (needs devtools::install_github("hypertidy/tabularaster"))
	## this leans entirely on the grid being regular: raster::cellFrom* is affine-only
	cell <- tabularaster::cellnumbers(tbrick, shp_spatial)
	## guesstimate at 3 hours for traditional loop over stack layers with cell number extract
	#for (i in seq_len(nlayers(tbrick))) x[[i]] <- raster::extract(tbrick[[i]], cell$cell_)

	library(future)
	## 'multisession' stands in for the 'multiprocess' strategy, which current
	## releases of the future package have deprecated; futures are resolved in
	## background R sessions
	plan(multisession)
	## worker handed to the parallel apply (scheduling is left to the future plan)
	## ilayer: index of a single layer in tbrick
	## returns whatever raster::extract() gives for that layer at the stored cells
	fun1 <- function(ilayer) {
		## the efficiency comes from the cell numbers being pre-calculated: only
		## those cells are read, and `cell` carries the polygon grouping-ID
		layer <- tbrick[[ilayer]]
		raster::extract(layer, cell$cell_)
	}

	## time the full extraction: fun1 is applied to every layer index and the
	## per-layer results are collected in the list x
	## NOTE(review): newer future releases ship future_lapply() in the separate
	## future.apply package - confirm against the installed version
	system.time({
		## nlayers() is namespace-qualified like the other raster calls here; no
		## library(raster) call is visible in this script, so the bare name may not resolve
		x <- future_lapply(seq_len(raster::nlayers(tbrick)), fun1)
	})
	## 12 cores
	#user system elapsed
	#11839.320 808.560 1220.962

	## YMMV - much depends on the grid dimensions, and its layout on disk - single-cell all-time
	## extraction is prohibitively slow in this particular case, see ?raster::writeRaster for its
	## terminology here in raw files (BSQ, BIL, BIP) - package ff handles these with ease, but
	## converting 40Gb of climate model output is rarely a practical workflow to optimize extraction
	## - i.e. the extraction scheme really matters