Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Last active February 17, 2025 21:17
Show Gist options
  • Save mdsumner/4ff89897e95d834073a2f49bf648707f to your computer and use it in GitHub Desktop.
Save mdsumner/4ff89897e95d834073a2f49bf648707f to your computer and use it in GitHub Desktop.

following on from https://bsky.app/profile/jerry-shannon.bsky.social/post/3lcisatdppk2h

gdalinfo /vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3
ERROR 15: No valid GCS credentials found. For authenticated requests, you need to set GS_SECRET_ACCESS_KEY, GS_ACCESS_KEY_ID, 
GS_OAUTH2_REFRESH_TOKEN, GOOGLE_APPLICATION_CREDENTIALS, or other configuration options, or create a /home/ubuntu/.boto file. Consult
https://gdal.org/en/stable/user/virtual_file_systems.html#vsigs-google-cloud-storage-files for more details. For unauthenticated requests
on public resources, set the GS_NO_SIGN_REQUEST configuration option to YES.

and so succes with

gdalinfo --config GS_NO_SIGN_REQUEST YES  /vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3
Driver: Zarr/Zarr
Files: /vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3
Size is 512, 512
Subdatasets:
  SUBDATASET_1_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/latitude
  SUBDATASET_1_DESC=Array /latitude
  SUBDATASET_2_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/level
  SUBDATASET_2_DESC=Array /level
  SUBDATASET_3_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/longitude
  SUBDATASET_3_DESC=Array /longitude
  SUBDATASET_4_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/time
  SUBDATASET_4_DESC=Array /time
  SUBDATASET_5_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/100m_u_component_of_wind
  SUBDATASET_5_DESC=Array /100m_u_component_of_wind
  SUBDATASET_6_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/100m_v_component_of_wind
  SUBDATASET_6_DESC=Array /100m_v_component_of_wind
  SUBDATASET_7_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/10m_u_component_of_neutral_wind
  SUBDATASET_7_DESC=Array /10m_u_component_of_neutral_wind
  SUBDATASET_8_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/10m_u_component_of_wind
  SUBDATASET_8_DESC=Array /10m_u_component_of_wind
  SUBDATASET_9_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/10m_v_component_of_neutral_wind
  SUBDATASET_9_DESC=Array /10m_v_component_of_neutral_wind
  SUBDATASET_10_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/10m_v_component_of_wind
  SUBDATASET_10_DESC=Array /10m_v_component_of_wind
  SUBDATASET_11_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/10m_wind_gust_since_previous_post_processing
  SUBDATASET_11_DESC=Array /10m_wind_gust_since_previous_post_processing
  SUBDATASET_12_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/2m_dewpoint_temperature
  SUBDATASET_12_DESC=Array /2m_dewpoint_temperature
  SUBDATASET_13_NAME=ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/2m_temperature
  SUBDATASET_13_DESC=Array /2m_temperature
...
@mdsumner
Copy link
Author

dsn <- "ZARR:/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3"
Sys.setenv(GS_NO_SIGN_REQUEST  = "YES")

library(stars)

read_mdim(dsn, proxy = TRUE, variable = "significant_height_of_total_swell")
stars_proxy object with 1 attribute in 1 file(s):
$significant_height_of_total_swell
[1] "[...]/full_37-1h-0p25deg-chunk-1.zarr-v3"

dimension(s):
          from      to         offset   delta  refsys x/y
longitude    1    1440         -0.125    0.25      NA [x]
latitude     1     721          90.12   -0.25      NA [y]
time         1 1323648 1900-01-01 UTC 1 hours POSIXct

@mdsumner
Copy link
Author

this does work (second band) but very slow

gdalinfo --config GS_NO_SIGN_REQUEST YES ZARR:"/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3":/significant_height_of_total_swell:{2}

GDAL_MAX_BAND_COUNT does not allow classic 2D use afaics

@mdsumner
Copy link
Author

mdsumner commented Feb 3, 2025

with multidimnew branch in mdsumner/gdalraster

library(gdalraster)  ##  'mdsumner/gdalraster/tree/multidimnew

Sys.setenv("GS_NO_SIGN_REQUEST" = "YES")
gs <- new(GDALMultiDimRaster, "/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3")
a <- gs$getArrayNames()
head(a, 12)
#  [1] "latitude"                                     "level"                                       
# [3] "longitude"                                    "time"                                        
# [5] "100m_u_component_of_wind"                     "100m_v_component_of_wind"                    
# [7] "10m_u_component_of_neutral_wind"              "10m_u_component_of_wind"                     
# [9] "10m_v_component_of_neutral_wind"              "10m_v_component_of_wind"                     
# [11] "10m_wind_gust_since_previous_post_processing" "2m_dewpoint_temperature" 

gs$getDimensionNames(a[1])
# [1] "latitude"

gs$getDimensionNames(a[6])
# [1] "time"      "latitude"  "longitude"
gs$getDimensionSizes(a[6])
# [1] 1323648     721    1440

gs$getDimensionNames("fraction_of_cloud_cover")
# [1] "time"      "level"     "latitude"  "longitude"
gs$getDimensionSizes("fraction_of_cloud_cover")
# [1] 1323648      37     721    1440

gs$close()

@mdsumner
Copy link
Author

mdsumner commented Feb 5, 2025

this works, getting DimensionNames is quite quick, not sure parallel is needed

tldr: in-dev bindings in R {gdalraster} for the GDAL multidim api, allows us to query array names and dimensions and coordinates from a remote ZARR (on goog storage), we can model an array shape and request a single time slice by "view syntax", and then we use cli to create a file from that (in future we will be able to run mdimtranslate as an api call and return lazily a dataset, or data in memory)

library(gdalraster)  ##  'mdsumner/gdalraster/tree/multidimnew
library(furrr)
library(glue)

Sys.setenv("GS_NO_SIGN_REQUEST" = "YES")
dsn <- "/vsigs/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3"
gs <- new(GDALMultiDimRaster, dsn)
a <- gs$getArrayNames()

options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")

## so now we can classify the arrays by their dimension set
plan(multicore)
dimnames <- future_map(a, gs$getDimensionNames)
plan(sequential)

unique(dimnames)
# [[1]]
# [1] "latitude"
# 
# [[2]]
# [1] "level"
# 
# [[3]]
# [1] "longitude"
# 
# [[4]]
# [1] "time"
# 
# [[5]]
# [1] "time"      "latitude"  "longitude"
# 
# [[6]]
# [1] "time"      "level"     "latitude"  "longitude"


## these are the n-dimensional shapes we have 
threes <- a[which(lengths(dimnames) == 3)]
fours <- a[which(lengths(dimnames) == 4)]
gs$getDimensionSizes(threes[1])
#[1] 1323648     721    1440

gs$getDimensionSizes(fours[1])
#[1] 1323648      37     721    1440

coords <- lapply(c("longitude", "latitude", "time", "level"), gs$getCoordinateValues)


## now we can model any of the arrays
(array <- threes[1])
# "100m_u_component_of_wind"

(sizes <- gs$getDimensionSizes(array))
# [1] 1323648     721    1440

gs$close()

# for now we have to use shell command to translate (having troubles with the headers for GDALMultiDimRaster::mdimtranslate)
##  we can get a GeoTIFF because the grid is regular
timestep <- sample(seq_len(sizes[1]), 1L)
(view <- glue("name={array},view=[{timestep},:,:]"))
# name=100m_u_component_of_wind,view=[327557,:,:]

cmd <- glue("gdalmdimtranslate -of GTiff {dsn} {tf <- tempfile(fileext = \".tif\")} -array \"{view}\" ")
system(cmd)


library(terra)
plot(terra::rast(tf))

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment