Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Created August 3, 2025 02:45
Show Gist options
  • Save mdsumner/2a7817917d7dbefaa2168cfda5ee1760 to your computer and use it in GitHub Desktop.
Save mdsumner/2a7817917d7dbefaa2168cfda5ee1760 to your computer and use it in GitHub Desktop.

do you need 500 slightly overlapping Zarr datasets?

src <- "/vsicurl/https://projects.pawsey.org.au/idea-gebco-tif/GEBCO_2024.tif"
src <- "https://projects.pawsey.org.au/idea-gebco-tif/GEBCO_2024.tif"
library(purrr) ## purrr CRAN
library(mirai) ## mirai CRAN
if (!file.exists(basename(src))) {
  curl::curl_download(src, basename(src))  ## curl CRAN
}
info <- vapour::vapour_raster_info(basename(src))  ## vapour CRAN
g <- grout::grout(info$dimension, info$extent, info$block * 6) ## devtols::install_github("hypertidy/grout")
idx <- grout::tile_index(g)

dir.create("zarr")
idx$lab <- sprintf("zarr/tile_%i_%i_%i.zarr", idx$tile, idx$tile_col, idx$tile_row)

mirai::daemons(0)
mirai::daemons(31)
fun <- in_parallel(function(.x) {
  b <- 1.0
  src <- "GEBCO_2024.tif"
  
  cmd <- "gdal raster clip --input %s --bbox %s --output-format ZARR --output %s --allow-bbox-outside-source"
  e <- unlist(.x[, c("xmin", "xmax", "ymin", "ymax")])
  bbox <- e[c(1, 3, 2, 4)] + b * c(-1, 1, -1, 1)
  if (file.exists(.x$lab)) return(NULL)
  system(sprintf(cmd, src, paste0(bbox, collapse = ","), .x$lab))
  NULL
})

walk(split(idx, 1:nrow(idx)), fun)
@mdsumner
Copy link
Author

mdsumner commented Aug 3, 2025

update to prevent out of bounds clipping, and use Zarr 3 and ZSTD

## A 4.4Gb COG of global topography, we download it because the server can't handle parallel requests fast enough
#src <- "/vsicurl/https://projects.pawsey.org.au/idea-gebco-tif/GEBCO_2024.tif"
src <- "https://projects.pawsey.org.au/idea-gebco-tif/GEBCO_2024.tif"
if (!file.exists(basename(src))) {
  curl::curl_download(src, basename(src))  ## curl CRAN
}

## purrr map() with async mirai to generate Zarr in parallel 
library(purrr) ## purrr CRAN
library(mirai) ## mirai CRAN

## basic gdal info (dim, extent, crs)
info <- vapour::vapour_raster_info(basename(src))  ## vapour CRAN

## a tiling scheme, based on a multiple of the COG's blocksize 
g <- grout::grout(info$dimension, info$extent, info$block * 6) ## devtools::install_github("hypertidy/grout")
## we index the tiling, we have a table of offsets, sizes, extents, etc. 
idx <- grout::tile_index(g)

## set up directory for Zarr output and directory names
dir.create("zarr")
idx$lab <- sprintf("zarr/tile_%i_%i_%i.zarr", idx$tile, idx$tile_col, idx$tile_row)

## set up async
mirai::daemons(0)
mirai::daemons(31)

## function to generate a Zarr from a tile specification (row from the idx table)
fun <- in_parallel(function(.x) {
  b <- 1.0
  src <- "GEBCO_2024.tif"
  
  ## we use the cli but could use as api with gdalraster (or osgeo.gdal)
  cmd <- "gdal raster clip --input %s --bbox %s --output-format ZARR --output %s --co COMPRESS=ZSTD --co FORMAT=ZARR_V3  "
  e <- unlist(.x[, c("xmin", "xmax", "ymin", "ymax")])
  bbox <- e[c(1, 3, 2, 4)] + b * c(-1, -1, 1, 1)
  if (bbox[1] < -180) {
    bbox[1] <- -180
  }
  if (bbox[3] >180) {
    bbox[3] <- 180
  }
  if (bbox[2] < -90) {
    bbox[2] <- -90
  }
  if (bbox[4] > 90) {
    bbox[4] <- 90
  }
  
  
  if (file.exists(.x$lab)) return(NULL)
  system(sprintf(cmd, src, paste0(bbox, collapse = ","), .x$lab))
  NULL
})
#fun(idx[1, ])
system.time(walk(split(idx, 1:nrow(idx)), fun))

plot the overlap

vaster::plot_extent(c(-180, 180, -90, 90), lwd = 2, lty = 2, border   = "firebrick")
lapply(split(idx[c("xmin", "xmax", "ymin", "ymax")], 1:nrow(idx)), function(x) 
{
  b <- 1
  bbox <- x[c(1, 3, 2, 4)] + b * c(-1, -1, 1, 1)
  vaster::plot_extent(bbox[c(1, 3, 2, 4)], add = TRUE)
}
  )
image

@mdsumner
Copy link
Author

mdsumner commented Aug 3, 2025

Here I've taken out the old vapour package, using gdalraster instead and remove the cli calls:

src <- "https://projects.pawsey.org.au/idea-gebco-tif/GEBCO_2024.tif"
library(purrr) ## purrr CRAN
library(mirai) ## mirai CRAN
if (!file.exists(basename(src))) {
  curl::curl_download(src, basename(src))  ## curl CRAN
}
ds <- new(gdalraster::GDALRaster, "GEBCO_2024.tif")
dm <- ds$dim()[1:2]
ex <- ds$bbox()[c(1, 3, 2, 4)]
block <-  ds$getBlockSize(1L)

g <- grout::grout(dm, ex, block * 6) ## devtols::install_github("hypertidy/grout")
idx <- grout::tile_index(g)

dir.create("zarr")
idx$lab <- sprintf("zarr/tile_%i_%i_%i.zarr", idx$tile, idx$tile_col, idx$tile_row)

mirai::daemons(0)
mirai::daemons(32)


translate <- in_parallel(function(.x) {
  b <- 1.0
  src <- "GEBCO_2024.tif"

  e <- unlist(.x[, c("xmin", "xmax", "ymin", "ymax")])
  bbox <- e[c(1, 3, 2, 4)] + b * c(-1, -1, 1, 1)
  if (bbox[1] < -180) {
    bbox[1] <- -180
  }
  if (bbox[3] >180) {
    bbox[3] <- 180
  }
  if (bbox[2] < -90) {
    bbox[2] <- -90
  }
  if (bbox[4] > 90) {
    bbox[4] <- 90
  }
  
  dsn <- sprintf("vrt://GEBCO_2024.tif?projwin=%f,%f,%f,%f", bbox[1], bbox[4], bbox[3], bbox[2])
  
  if (file.exists(.x$lab)) return(NULL)
  gdalraster::translate(dsn, .x$lab, cl_arg = c("-of", "ZARR", "-co", "COMPRESS=ZSTD", "-co", "FORMAT=ZARR_V3"), quiet = TRUE)
  NULL
})
#fun(idx[1, ])
system.time(walk(split(idx, 1:nrow(idx)), translate))

gdalraster::set_config_option("GDAL_NUM_THREADS", "ALL_CPUS")
gdalraster::warp(fs::dir_ls("zarr", type = "d", regexp = ".*\\.zarr$"), "warp.tif", quiet = TRUE, 
                 t_srs = "EPSG:4326", cl_arg = c("-te", "-180", "-90", "180", "90", "-ts", "86400", "43200", 
                 "-co", "BIGTIFF=YES", "-co",  "COMPRESS=ZSTD"))


I made a VRT mosaic to see what the inferred georeferencing is and it's better to use the explicit one (which I do know exactly). I might explore this mosaic logic a bit

gdal raster mosaic zarr/*.zarr o.vrt  --resolution 0.004167,0.004167


Size is 86393, 43197
Coordinate System is:
GEOGCRS["WGS 84",
    DATUM["World Geodetic System 1984",
        ELLIPSOID["WGS 84",6378137,298.257223563,
            LENGTHUNIT["metre",1]]],
    PRIMEM["Greenwich",0,
        ANGLEUNIT["degree",0.0174532925199433]],
    CS[ellipsoidal,2],
        AXIS["geodetic latitude (Lat)",north,
            ORDER[1],
            ANGLEUNIT["degree",0.0174532925199433]],
        AXIS["geodetic longitude (Lon)",east,
            ORDER[2],
            ANGLEUNIT["degree",0.0174532925199433]],
    ID["EPSG",4326]]
Data axis to CRS axis mapping: 2,1
Origin = (-180.000000000000000,90.000000000000000)
Pixel Size = (0.004167000000000,-0.004167000000000)
Corner Coordinates:
Upper Left  (-180.0000000,  90.0000000) (180d 0' 0.00"W, 90d 0' 0.00"N)
Lower Left  (-180.0000000, -90.0018990) (180d 0' 0.00"W, 90d 0' 6.84"S)
Upper Right ( 179.9996310,  90.0000000) (179d59'58.67"E, 90d 0' 0.00"N)
Lower Right ( 179.9996310, -90.0018990) (179d59'58.67"E, 90d 0' 6.84"S)
Center      (  -0.0001845,  -0.0009495) (  0d 0' 0.66"W,  0d 0' 3.42"S)
Band 1 Block=128x128 Type=Int16, ColorInterp=Gray
  NoData Value=-32767

@mdsumner
Copy link
Author

mdsumner commented Aug 4, 2025

Here's a python version of the convert to COG. We don't have to set outputBounds, or width/height unless we want them as something specific rather than derived from the Zarrs - probably do have to set dstSRS if it's not recorded well in the Zarrs. For creation options for COG see the format page: https://gdal.org/en/stable/drivers/raster/zarr.html#creation-options

from osgeo import gdal
from glob import glob
gdal.UseExceptions()
gdal.SetConfigOption("GDAL_NUM_THREADS", "ALL_CPUS")
zarrdirs = glob("zarr/*.zarr")
gdal.Warp("warp.tif", zarrdirs, format = "COG",
                 dstSRS = "EPSG:4326", 
                 outputBounds = [-180, -90, 180, 90], 
                 width = 86400, height = 43200, 
                 creationOptions = ["BIGTIFF=YES", "COMPRESS=ZSTD"])
                 

@mdsumner
Copy link
Author

mdsumner commented Aug 4, 2025

here's a version that uses a server for the GEBCO that is capable of intense requests (only a tiny bit slower than local!)

dsn <- "/vsicurl/https://data.source.coop/alexgleith/gebco-2024/GEBCO_2024.tif"
library(purrr) ## purrr CRAN
library(mirai) ## mirai CRAN
ds <- new(gdalraster::GDALRaster, dsn)
dm <- ds$dim()[1:2]
ex <- ds$bbox()[c(1, 3, 2, 4)]
block <-  ds$getBlockSize(1L)

g <- grout::grout(dm, ex, block * 6) ## devtols::install_github("hypertidy/grout")
idx <- grout::tile_index(g)

dir.create("zarr")
idx$lab <- sprintf("zarr/tile_%i_%i_%i.zarr", idx$tile, idx$tile_col, idx$tile_row)

mirai::daemons(0)
mirai::daemons(32)

translate <- in_parallel(function(.x) {
  b <- 1.0
  src <- "/vsicurl/https://data.source.coop/alexgleith/gebco-2024/GEBCO_2024.tif"
  
  e <- unlist(.x[, c("xmin", "xmax", "ymin", "ymax")])
  bbox <- e[c(1, 3, 2, 4)] + b * c(-1, -1, 1, 1)
  if (bbox[1] < -180) {
    bbox[1] <- -180
  }
  if (bbox[3] >180) {
    bbox[3] <- 180
  }
  if (bbox[2] < -90) {
    bbox[2] <- -90
  }
  if (bbox[4] > 90) {
    bbox[4] <- 90
  }
  
  dsn <- sprintf("vrt://GEBCO_2024.tif?projwin=%f,%f,%f,%f", bbox[1], bbox[4], bbox[3], bbox[2])
  
  if (file.exists(.x$lab)) return(NULL)
  gdalraster::translate(dsn, .x$lab, cl_arg = c("-of", "ZARR", "-co", "COMPRESS=ZSTD", "-co", "FORMAT=ZARR_V3"), quiet = TRUE)
  NULL
})
system.time(walk(split(idx, 1:nrow(idx)), translate))
## 25seconds

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment