With R, this is just proving to myself that I can write all the local-path references.
Run on 128 cores on the HPC it has done about 3200 of these files so far (it's still going; there are 8000+ days since 2002 June 01). Virtualizing netcdf is insanely efficient, but the details of how to auth those remote links are still beyond me (which is what I'm working towards understanding). The scratch path below on my lustre storage would be replaced by the base url and its child date structure:
https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1
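As a sketch of what that replacement could look like (leaning on the ghrsst_baseurl() and ghrsst_folder() helpers defined further down; the exact remote layout is the part I still need to verify):

## sketch only: derive the date from the filename, then swap the local
## scratch prefix for the remote base url plus its YYYY/MM/DD child folder
remap_refs <- function(refs) {
  dplyr::mutate(refs,
    date = as.Date(substr(basename(path), 1, 8), "%Y%m%d"),
    path = file.path(ghrsst_baseurl(), ghrsst_folder(date), basename(path)))
}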
The reference rows look like this (shown here without the shard structure):
dplyr::collect(p)
# A tibble: 700,000 × 4
path offset size raw
<chr> <int> <int> <arr>
1 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 40762 4083 NULL
2 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 44845 4083 NULL
3 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 48928 4083 NULL
4 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 53011 4083 NULL
5 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 57094 4083 NULL
6 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 61177 4083 NULL
7 /scratch/project/mdsumner/ghrsst/20020601090000-JPL-L4… 65260 4083 NULL
The shard structure is:
find parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet -type f
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/sea_ice_fraction/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/mask/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/.zmetadata
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/analysed_sst/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/time/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/lon/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/analysis_error/refs.0.parq
parquet/20080630090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.parquet/lat/refs.0.parq
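Each variable's refs.0.parq reads back as ordinary parquet; the p printed above was opened along these lines, pooling the analysed_sst shards across all days (a sketch, layout as in the listing):

library(arrow)
## gather every analysed_sst shard into one arrow Dataset
files <- fs::dir_ls("parquet", recurse = TRUE, glob = "*analysed_sst/refs.0.parq")
p <- open_dataset(files)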
## there are some unused functions here
## my earthdata 'Authorization: ...' header is in ~/earthdata (needed for curl to download these netcdfs)
## I use this image:
## docker run --rm -ti ghcr.io/mdsumner/gdal-builds:rocker-gdal-dev-python bash
ghrsst_dates <- function() {
seq(as.Date("2002-06-01"), Sys.Date()-1, by = "1 day")
}
ghrsst_folder <- function(date) {
format(date-1, "%Y/%m/%d")
}
ghrsst_filedate <- function(date) {
format(date, "%Y%m%d")
}
ghrsst_baseurl <- function() {
"https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1"
}
ghrsst_file <- function(date) {
ymd <- ghrsst_filedate(date)
sprintf("%s090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc", ymd)
}
ghrsst_url <- function(date) {
sprintf("%s/%s", ghrsst_baseurl(), ghrsst_file(date))
}
ghrsst_virtual <- function(date) {
sprintf("%s/%s/%s", ghrsst_baseurl(), ghrsst_folder(date), ghrsst_file(date))
}
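For a concrete date the two styles come out as follows (just evaluating the helpers above; note that ghrsst_folder() keys the child folder off the previous day):

ghrsst_url(as.Date("2020-01-01"))
## "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc"
ghrsst_virtual(as.Date("2020-01-01"))
## "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/2019/12/31/20200101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc"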
ghrsst_extent <- function() {
c(-179.995,180.0050,-89.995,89.995)
}
ghrsst_vsi <- function(date, sds = "analysed_sst") {
  input_path <- ghrsst_url(date)
  ## assign the georeferencing and scale/offset that the netcdf metadata implies
  sprintf("vrt://NetCDF:\"/vsicurl/%s\":%s?a_ullr=-179.995,89.995,180.0050,-89.995&a_offset=25&a_scale=0.001&a_srs=EPSG:4326",
          input_path, sds)
}
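A usage sketch (assuming a recent GDAL and the earthdata auth configured below; terra hands the vrt:// string straight to GDAL, so this is untested convenience rather than part of the run):

## hypothetical usage, not part of the run below
library(terra)
r <- rast(ghrsst_vsi(as.Date("2020-01-01")))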
ghrsst_key <- function(date, sds, bucket) {
  ## e.g. 20200101090000-...-fv04.1_analysed_sst.tif under the bucket
  file <- gsub("\\.nc$", sprintf("_%s.tif", sds), ghrsst_file(date))
  file.path("/vsis3", bucket, file)
}
file_earthdata <- function() {
Sys.getenv("GDAL_HTTP_HEADER_FILE")
}
env_earthdata <- function() {
Sys.getenv("GDAL_HTTP_HEADERS")
}
## check for env setting for earthdata (and include a test)
file_earthdata_auth <- function() {
file <- file_earthdata()
nzchar(file) && file.exists(file)
}
env_earthdata_auth <- function() {
header <- env_earthdata()
nzchar(header)
}
has_earthdata_auth <- function() {
file_earthdata_auth() || env_earthdata_auth()
}
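To make these checks pass I point GDAL at the same header file curl uses (GDAL_HTTP_HEADER_FILE is a standard GDAL config option; ~/earthdata is just my convention):

Sys.setenv(GDAL_HTTP_HEADER_FILE = path.expand("~/earthdata"))
stopifnot(has_earthdata_auth())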
library(reticulate)
use_python("/workenv/bin/python3")
dofun <- function(date) {
  url <- ghrsst_url(date)
  file <- basename(url)
  parquetfile <- file.path("parquet", gsub("nc$", "parquet", file))
  ## skip days that have already been virtualized
  if (file.exists(parquetfile)) return(NULL)
  ## ~/earthdata holds a line like 'Authorization: <credentials>'
  authorization <- readLines("~/earthdata")
  h <- curl::new_handle()
  curl::handle_setopt(h, customrequest = "GET")
  curl::handle_setheaders(h, "Authorization" = gsub("Authorization: ", "", authorization))
  tst <- try(curl::curl_download(url, file, handle = h))
  if (inherits(tst, "try-error")) return(NULL)
  ## write kerchunk parquet references for the downloaded netcdf, then drop it
  virtualizarr <- import("virtualizarr")
  vds1 <- virtualizarr$open_virtual_dataset(file)
  vds1$virtualize$to_kerchunk(parquetfile, format = 'parquet')
  fs::file_delete(file)
  parquetfile
}
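A single-date smoke test before fanning out (returns the parquet path on success, NULL if skipped or failed):

dofun(ghrsst_dates()[1])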
dates <- ghrsst_dates()
options(parallelly.fork.enable = TRUE, future.rng.onMisuse = "ignore")
library(furrr); plan(multicore)
a <- future_map(dates, dofun)
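Because dofun() skips days that already have their parquet output, the map is idempotent; NULLs from failed downloads get mopped up by simply re-running, and progress is just a directory count:

## how many days are done so far, then retry anything failed or new
length(fs::dir_ls("parquet", glob = "*.parquet"))
a <- future_map(dates, dofun)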