Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Last active March 19, 2025 07:29
Show Gist options
  • Save mdsumner/1141a21ac8ca5fe78b73430a351089d0 to your computer and use it in GitHub Desktop.
Save mdsumner/1141a21ac8ca5fe78b73430a351089d0 to your computer and use it in GitHub Desktop.

With R, this is just proving to myself that I can write all the local-path references.

Ran on 128 cores on HPC it did about 3200 of these files (it's still going; there are 8000+ days since 2002 June 01). Virtualizing netcdf is insanely efficient, but the details of how to auth those remote links are still beyond me (which is what I'm working towards understanding). The scratch path below on my lustre storage would be replaced by the base url and its child date structure:

https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1

They look like this here without the shard structure.

dplyr::collect(p)
# A tibble: 700,000 × 4
   path                                                       offset  size   raw
   <chr>                                                       <int> <int> <arr>
 1 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  40762  4083  NULL
 2 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  44845  4083  NULL
 3 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  48928  4083  NULL
 4 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  53011  4083  NULL
 5 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  57094  4083  NULL
 6 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  61177  4083  NULL
 7 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4…  65260  4083  NULL

shard structure is

find parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet  -type f
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/sea_ice_fraction/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/mask/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/.zmetadata
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/analysed_sst/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/time/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/lon/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/analysis_error/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/lat/refs.0.parq


## there are some unused functions here
## my earthdata 'Authorization Header: ...' is in ~/earthdata (needed for curl to download these netcdfs)
## I use this image:
## docker run --rm -ti ghcr.io/mdsumner/gdal-builds:rocker-gdal-dev-python bash

## Every date in the GHRSST MUR record: from the start of the archive
## (2002-06-01) up to and including yesterday.
ghrsst_dates <- function() {
  first_day <- as.Date("2002-06-01")
  last_day <- Sys.Date() - 1
  seq(first_day, last_day, by = "1 day")
}

## "YYYY/mm/dd" folder component for a date. Note the deliberate
## one-day offset from the file's nominal date (used by the virtual
## shard layout, not by the flat PODAAC archive URL).
ghrsst_folder <- function(date) {
  previous_day <- date - 1
  format(previous_day, "%Y/%m/%d")
}
## Compact "YYYYmmdd" stamp used in GHRSST file names.
ghrsst_filedate <- function(date) {
  strftime(date, "%Y%m%d")
}
## Base URL of the protected PODAAC cumulus archive for MUR v4.1.
ghrsst_baseurl <- function() {
  paste0(
    "https://archive.podaac.earthdata.nasa.gov",
    "/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1"
  )
}
## File name of the daily MUR netcdf for a given date.
ghrsst_file <- function(date) {
  stamp <- ghrsst_filedate(date)
  paste0(stamp, "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc")
}
## Full (flat) archive URL for a given date's netcdf file.
ghrsst_url <- function(date) {
  paste(ghrsst_baseurl(), ghrsst_file(date), sep = "/")
}

## URL for a given date using the "virtual" year/month/day child
## structure (base url + offset folder + file name).
ghrsst_virtual <- function(date) {
  paste(ghrsst_baseurl(), ghrsst_folder(date), ghrsst_file(date), sep = "/")
}

## Geographic extent of the MUR grid as c(xmin, xmax, ymin, ymax).
ghrsst_extent <- function() {
  xlim <- c(-179.995, 180.0050)
  ylim <- c(-89.995, 89.995)
  c(xlim, ylim)
}

## GDAL vrt connection string for one subdataset of a date's remote file.
##
## Fix: the `sds` argument was accepted but ignored — "analysed_sst" was
## hard-coded in the vrt string. It is now interpolated, so other
## subdatasets (e.g. "analysis_error") can be addressed. Default output
## is unchanged.
##
## NOTE(review): a_offset=25 / a_scale=0.001 are the packing for the SST
## variables; confirm they apply to other subdatasets before relying on
## scaled values.
ghrsst_vsi <- function(date, sds = "analysed_sst") {
  input_path <- ghrsst_url(date)
  sprintf("vrt://NetCDF:\"/vsicurl/%s\":%s?a_ullr=-179.995,89.995,180.0050,-89.995&a_offset=25&a_scale=0.001&a_srs=EPSG:4326",
          input_path, sds)
}

## Object-store key (/vsis3 path) for a per-date, per-subdataset tif.
##
## Fixes two defects in the original (which the header comment notes is
## currently unused): (1) the computed file name was discarded, so every
## call returned the same bucket path; (2) gsub("nc$", "tif", ...) could
## never match because "_<sds>" was appended after the ".nc" suffix.
ghrsst_key <- function(date, sds, bucket) {
  # strip the ".nc" suffix before appending the subdataset name
  base <- sub("\\.nc$", "", ghrsst_file(date))
  file <- sprintf("%s_%s.tif", base, sds)
  file.path("/vsis3", bucket, file)
}


## Path of the GDAL header file holding the earthdata Authorization
## header ("" when the variable is unset).
file_earthdata <- function() {
  Sys.getenv("GDAL_HTTP_HEADER_FILE", unset = "")
}

## Inline GDAL HTTP headers from the environment ("" when unset).
env_earthdata <- function() {
  Sys.getenv("GDAL_HTTP_HEADERS", unset = "")
}
## check for env setting for earthdata (and include a test)
## TRUE when a header file is configured AND exists on disk.
file_earthdata_auth <- function() {
  path <- file_earthdata()
  if (!nzchar(path)) {
    return(FALSE)
  }
  file.exists(path)
}
## TRUE when inline headers are configured via the environment.
env_earthdata_auth <- function() {
  nzchar(env_earthdata())
}


## Either auth mechanism (header file or inline headers) will do.
has_earthdata_auth <- function() {
  if (file_earthdata_auth()) {
    return(TRUE)
  }
  env_earthdata_auth()
}

library(reticulate)
use_python("/workenv/bin/python3")


## Download one day's MUR netcdf, virtualize it with virtualizarr, and
## write the kerchunk references as a parquet shard under "parquet/".
##
## Returns the parquet path on success, NULL when the shard already
## exists or when the download fails (so a parallel map can simply skip
## and be re-run later to fill gaps).
##
## Fixes: try() now uses silent = TRUE so failed downloads don't spam
## worker logs in the parallel map; the "parquet" output directory is
## created if missing instead of assuming it exists.
dofun <- function(date) {

  url <- ghrsst_url(date)
  file <- basename(url)
  # ensure the output directory exists (no-op when it already does)
  dir.create("parquet", showWarnings = FALSE)
  parquetfile <- file.path("parquet", gsub("nc$", "parquet", file))
  # already virtualized on a previous run
  if (file.exists(parquetfile)) return(NULL)

  # ~/earthdata holds "Authorization: <token>"; strip the prefix because
  # handle_setheaders() supplies the header name itself
  authorization <- readLines("~/earthdata")
  h <- curl::new_handle()
  curl::handle_setopt(h,  customrequest = "GET")
  curl::handle_setheaders(h, "Authorization" = gsub("Authorization: ", "", authorization))

  # best-effort: a failed day is skipped, to be retried on a later run
  tst <- try(curl::curl_download(url, file, handle = h), silent = TRUE)
  if (inherits(tst, "try-error")) return(NULL)
  virtualizarr <- import("virtualizarr")

  vds1 <- virtualizarr$open_virtual_dataset(file)

  vds1$virtualize$to_kerchunk(parquetfile, format='parquet')
  # the references are all we keep; drop the (large) source netcdf
  fs::file_delete(file)
  parquetfile
}


# Driver: virtualize every day in the record, in parallel.
dates <- ghrsst_dates()
# enable forked workers for future/furrr; silence RNG-misuse warnings
# (dofun does no random number generation)
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)

# one parquet reference store per day; NULL entries are days that were
# already done or whose download failed (re-run to fill gaps)
a <- future_map(dates, dofun)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment