Created
February 28, 2025 08:45
-
-
Save nishadhka/bde573553cfc86cc8bca747cb253a455 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "37c14374-5c0a-4047-b7b9-980c76416b1d", | |
| "metadata": {}, | |
| "source": [ | |
| "## Reading the ECMWF file with `grib_tree`\n", | |
| "\n", | |
| "Note: the ensemble member numbers are missing from the zarr/xarray dataset opened through datatree." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "188ab48e-6fb6-45d1-a509-1ba9a027762b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from kerchunk.grib2 import scan_grib, grib_tree\n", | |
| "\n", | |
| "# Forecast date (YYYYMMDD) used for both the S3 prefix and the file name\n", | |
| "date_str = '20240229'\n", | |
| "ecmwf_s3url = (\n", | |
| "    f\"s3://ecmwf-forecasts/{date_str}/00z/ifs/0p25/enfo/\"\n", | |
| "    f\"{date_str}000000-0h-enfo-ef.grib2\"\n", | |
| ")\n", | |
| "# Scanning this file (about 4 GB) took 5-7 minutes\n", | |
| "esc_groups = scan_grib(ecmwf_s3url)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "a8205dde-fddd-46da-87c3-0134ea68f4a2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "=== ORIGINAL TREE ANALYSIS ===\n", | |
| "Total references: 757\n", | |
| "\n", | |
| "References by variable prefix:\n", | |
| " v: 54\n", | |
| " u: 54\n", | |
| " gh: 54\n", | |
| " t: 54\n", | |
| " r: 54\n", | |
| " q: 54\n", | |
| " d: 54\n", | |
| " vo: 54\n", | |
| " t2m: 30\n", | |
| " u10: 30\n", | |
| " v10: 30\n", | |
| " tp: 30\n", | |
| " msl: 30\n", | |
| " lsm: 30\n", | |
| " sp: 30\n", | |
| " skt: 30\n", | |
| " ro: 30\n", | |
| " st: 27\n", | |
| " tcwv: 27\n", | |
| " .zgroup: 1\n", | |
| "\n", | |
| "References by path depth:\n", | |
| " Level 0: 1\n", | |
| " Level 1: 38\n", | |
| " Level 2: 38\n", | |
| " Level 3: 38\n", | |
| " Level 4: 642\n", | |
| " Level 5: 0\n", | |
| "\n", | |
| "Ensemble-related references: 121\n", | |
| " Examples:\n", | |
| " t2m/instant/heightAboveGround/number/.zarray\n", | |
| " t2m/instant/heightAboveGround/number/.zattrs\n", | |
| " t2m/instant/heightAboveGround/number/0.0\n", | |
| " u10/instant/heightAboveGround/number/.zarray\n", | |
| " u10/instant/heightAboveGround/number/.zattrs\n", | |
| "\n", | |
| "=== ENSEMBLE TREE ANALYSIS ===\n", | |
| "Total references: 757\n", | |
| "\n", | |
| "References by variable prefix:\n", | |
| " v: 54\n", | |
| " u: 54\n", | |
| " gh: 54\n", | |
| " t: 54\n", | |
| " r: 54\n", | |
| " q: 54\n", | |
| " d: 54\n", | |
| " vo: 54\n", | |
| " t2m: 30\n", | |
| " u10: 30\n", | |
| " v10: 30\n", | |
| " tp: 30\n", | |
| " msl: 30\n", | |
| " lsm: 30\n", | |
| " sp: 30\n", | |
| " skt: 30\n", | |
| " ro: 30\n", | |
| " st: 27\n", | |
| " tcwv: 27\n", | |
| " .zgroup: 1\n", | |
| "\n", | |
| "References by path depth:\n", | |
| " Level 0: 1\n", | |
| " Level 1: 38\n", | |
| " Level 2: 38\n", | |
| " Level 3: 38\n", | |
| " Level 4: 642\n", | |
| " Level 5: 0\n", | |
| "\n", | |
| "Ensemble-related references: 121\n", | |
| " Examples:\n", | |
| " t2m/instant/heightAboveGround/number/.zarray\n", | |
| " t2m/instant/heightAboveGround/number/.zattrs\n", | |
| " t2m/instant/heightAboveGround/number/0.0\n", | |
| " u10/instant/heightAboveGround/number/.zarray\n", | |
| " u10/instant/heightAboveGround/number/.zattrs\n", | |
| "\n", | |
| "=== SAMPLE VARIABLE COMPARISON ===\n", | |
| "Sample variable: .zgroup\n", | |
| "Original tree paths: 0\n", | |
| "Ensemble tree paths: 0\n", | |
| "KeysView(DataTree('None', parent=None)\n", | |
| "├── DataTree('d')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Divergence\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (time: 1, step: 1, isobaricInhPa: 9, latitude: 721,\n", | |
| "│ longitude: 1440)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ d (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('gh')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Geopotential height\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (time: 1, step: 1, isobaricInhPa: 9, latitude: 721,\n", | |
| "│ longitude: 1440)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ gh (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('lsm')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Land-sea mask\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ surface float64 8B ...\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ lsm (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('msl')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Mean sea level pressure\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('meanSea')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ meanSea float64 8B ...\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ msl (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: meanSea\n", | |
| "├── DataTree('q')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Specific humidity\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| "│ step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ q (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('r')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Relative humidity\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| "│ step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ r (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('ro')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Runoff\n", | |
| "│ └── DataTree('accum')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: accum\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ surface float64 8B ...\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ ro (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('skt')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Skin temperature\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ surface float64 8B ...\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ skt (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('sp')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Surface pressure\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ surface float64 8B ...\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ sp (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('st')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Soil temperature\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('depthBelowLandLayer')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ st (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: depthBelowLandLayer\n", | |
| "├── DataTree('t')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Temperature\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| "│ step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ t (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('t2m')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: 2 metre temperature\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ t2m (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "├── DataTree('tcwv')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Total column vertically-integrated water vapour\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('entireAtmosphere')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ tcwv (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: entireAtmosphere\n", | |
| "├── DataTree('tp')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: Total precipitation\n", | |
| "│ └── DataTree('accum')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: accum\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ surface float64 8B ...\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ tp (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('u')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: U component of wind\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| "│ step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ u (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('u10')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: 10 metre U wind component\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ u10 (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "├── DataTree('v')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: V component of wind\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| "│ step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ number (time, step, isobaricInhPa) int64 72B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| "│ Data variables:\n", | |
| "│ v (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('v10')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ name: 10 metre V wind component\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ number (time, step) int64 8B ...\n", | |
| "│ * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ valid_time (time, step) datetime64[ns] 8B ...\n", | |
| "│ Data variables:\n", | |
| "│ v10 (time, step, latitude, longitude) float64 8MB ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "└── DataTree('vo')\n", | |
| " │ Dimensions: ()\n", | |
| " │ Data variables:\n", | |
| " │ *empty*\n", | |
| " │ Attributes:\n", | |
| " │ name: Vorticity (relative)\n", | |
| " └── DataTree('instant')\n", | |
| " │ Dimensions: ()\n", | |
| " │ Data variables:\n", | |
| " │ *empty*\n", | |
| " │ Attributes:\n", | |
| " │ stepType: instant\n", | |
| " └── DataTree('isobaricInhPa')\n", | |
| " Dimensions: (isobaricInhPa: 9, latitude: 721, longitude: 1440, time: 1,\n", | |
| " step: 1)\n", | |
| " Coordinates:\n", | |
| " * isobaricInhPa (isobaricInhPa) float64 72B 50.0 200.0 250.0 ... 925.0 1e+03\n", | |
| " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| " * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| " number (time, step, isobaricInhPa) int64 72B ...\n", | |
| " * step (step) timedelta64[ns] 8B 00:00:00\n", | |
| " * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| " valid_time (time, step, isobaricInhPa) datetime64[ns] 72B ...\n", | |
| " Data variables:\n", | |
| " vo (time, step, isobaricInhPa, latitude, longitude) float64 75MB ...\n", | |
| " Attributes:\n", | |
| " typeOfLevel: isobaricInhPa)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Build the reference tree from the groups returned by scan_grib\n", | |
| "original_tree = grib_tree(esc_groups)\n", | |
| "# Compare the structures\n", | |
| "# NOTE(review): the same tree is passed for both arguments, so the two analysis\n", | |
| "# sections printed below are identical by construction.\n", | |
| "# NOTE(review): analyze_grib_tree_output is not defined in the part of the notebook\n", | |
| "# visible here - TODO confirm it is defined before this cell on a fresh kernel.\n", | |
| "analysis = analyze_grib_tree_output(original_tree, original_tree)\n", | |
| "\n", | |
| "# Check if we can open with datatree\n", | |
| "# NOTE(review): datatree and fsspec are only imported in a later cell of this notebook;\n", | |
| "# that cell has to run first (or the imports must move up) for Restart & Run All to work.\n", | |
| "gfs_dt = datatree.open_datatree(\n", | |
| " fsspec.filesystem(\"reference\", fo=original_tree).get_mapper(\"\"), \n", | |
| " engine=\"zarr\", \n", | |
| " consolidated=False\n", | |
| ")\n", | |
| "\n", | |
| "# The key test: can we access ensemble members?\n", | |
| "# In the recorded output 'number' shows up only as a non-index coordinate over\n", | |
| "# (time, step[, isobaricInhPa]); no node lists it as a dimension.\n", | |
| "print(gfs_dt.keys()) # Check for variables\n", | |
| "#var_node = gfs_dt['variable_name'] # Pick one variable\n", | |
| "#print(var_node.dims) # Should include 'number' dimension if ensemble data is present" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3c8111f1-4281-4018-ae95-6239a4f33fda", | |
| "metadata": {}, | |
| "source": [ | |
| "## With the help of Claude Sonnet 3.7, the function is updated to include the ensemble numbers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "d3d13982-9a44-4550-986d-9617f0d44207", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import copy\n", | |
| "import json\n", | |
| "from collections import defaultdict\n", | |
| "from typing import Dict, Iterable, List, Optional, Set\n", | |
| "\n", | |
| "import datatree\n", | |
| "import fsspec\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import zarr\n", | |
| "from kerchunk.combine import MultiZarrToZarr\n", | |
| "from kerchunk.grib2 import scan_grib, grib_tree\n", | |
| "\n", | |
| "def s3_parse_ecmwf_grib_idx(\n", | |
| "    fs: fsspec.AbstractFileSystem,\n", | |
| "    basename: str,\n", | |
| "    suffix: str = \"index\",\n", | |
| "    tstamp: Optional[pd.Timestamp] = None,\n", | |
| "    validate: bool = False,\n", | |
| ") -> pd.DataFrame:\n", | |
| "    \"\"\"\n", | |
| "    Standalone method used to extract metadata from a grib2 index file\n", | |
| "\n", | |
| "    The index file is read line by line and every line is parsed as one JSON\n", | |
| "    object; each object becomes one row of the returned frame.\n", | |
| "\n", | |
| "    :param fs: the file system to read from\n", | |
| "    :param basename: the base name is the full path to the grib file\n", | |
| "    :param suffix: the suffix is the ending for the index file\n", | |
| "    :param tstamp: the timestamp to record for this index process\n", | |
| "        (defaults to ``pd.Timestamp.now()``)\n", | |
| "    :param validate: when True, raise if two index entries carry identical\n", | |
| "        attribute mappings\n", | |
| "    :return: the data frame containing the results, indexed by the line\n", | |
| "        position (``idx``) of each entry in the index file\n", | |
| "    :raises ValueError: if a line cannot be parsed as JSON, or if ``validate``\n", | |
| "        is set and the attribute mappings are not unique\n", | |
| "    \"\"\"\n", | |
| "    fname = f\"{basename.rsplit('.', 1)[0]}.{suffix}\"\n", | |
| "\n", | |
| "    # Invalidate the filesystem cache for both paths before querying them\n", | |
| "    fs.invalidate_cache(fname)\n", | |
| "    fs.invalidate_cache(basename)\n", | |
| "\n", | |
| "    baseinfo = fs.info(basename)\n", | |
| "\n", | |
| "    splits = []\n", | |
| "    with fs.open(fname, \"r\") as f:\n", | |
| "        for idx, line in enumerate(f):\n", | |
| "            # Removing the trailing characters if there's any at the end of the line\n", | |
| "            clean_line = line.strip().rstrip(',')\n", | |
| "            try:\n", | |
| "                data = json.loads(clean_line)\n", | |
| "            except json.JSONDecodeError as e:\n", | |
| "                # Handle cases where JSON conversion fails\n", | |
| "                raise ValueError(f\"Could not parse JSON from line: {line}\") from e\n", | |
| "            # Missing keys fall back to defaults: 0 for offset/length,\n", | |
| "            # 'Unknown Date' for the date and -1 for the ensemble number\n", | |
| "            splits.append([\n", | |
| "                int(idx),\n", | |
| "                int(data.get(\"_offset\", 0)),\n", | |
| "                int(data.get(\"_length\", 0)),\n", | |
| "                data.get(\"date\", \"Unknown Date\"),\n", | |
| "                data,\n", | |
| "                int(data.get(\"number\", -1)),\n", | |
| "            ])\n", | |
| "\n", | |
| "    result = pd.DataFrame(splits, columns=[\"idx\", \"offset\", \"length\", \"date\", \"attr\", \"ens_number\"])\n", | |
| "\n", | |
| "    # Offsets and lengths are taken directly from the index entries above\n", | |
| "    result.loc[:, \"idx_uri\"] = fname\n", | |
| "    result.loc[:, \"grib_uri\"] = basename\n", | |
| "\n", | |
| "    if tstamp is None:\n", | |
| "        tstamp = pd.Timestamp.now()\n", | |
| "    # NOTE(review): the row-wise apply is kept as written; a direct scalar\n", | |
| "    # assignment via .loc had been commented out here - reason not visible\n", | |
| "    result['indexed_at'] = result.apply(lambda x: tstamp, axis=1)\n", | |
| "\n", | |
| "    # Check for S3 or GCS filesystem instances to handle metadata\n", | |
| "    if \"s3\" in fs.protocol:\n", | |
| "        # Use ETag as the S3 equivalent to crc32c\n", | |
| "        result.loc[:, \"grib_etag\"] = baseinfo.get(\"ETag\")\n", | |
| "        result.loc[:, \"grib_updated_at\"] = pd.to_datetime(\n", | |
| "            baseinfo.get(\"LastModified\")\n", | |
| "        ).tz_localize(None)\n", | |
| "\n", | |
| "        idxinfo = fs.info(fname)\n", | |
| "        result.loc[:, \"idx_etag\"] = idxinfo.get(\"ETag\")\n", | |
| "        result.loc[:, \"idx_updated_at\"] = pd.to_datetime(\n", | |
| "            idxinfo.get(\"LastModified\")\n", | |
| "        ).tz_localize(None)\n", | |
| "    else:\n", | |
| "        # TODO: Fix metadata for other filesystems\n", | |
| "        result.loc[:, \"grib_crc32\"] = None\n", | |
| "        result.loc[:, \"grib_updated_at\"] = None\n", | |
| "        result.loc[:, \"idx_crc32\"] = None\n", | |
| "        result.loc[:, \"idx_updated_at\"] = None\n", | |
| "\n", | |
| "    if validate:\n", | |
| "        # The \"attr\" column holds dicts, which are unhashable, so uniqueness is\n", | |
| "        # checked on a canonical JSON rendering of each mapping instead\n", | |
| "        attr_keys = result[\"attr\"].map(lambda attrs: json.dumps(attrs, sort_keys=True))\n", | |
| "        if not attr_keys.is_unique:\n", | |
| "            raise ValueError(f\"Attribute mapping for grib file {basename} is not unique\")\n", | |
| "    print(f'Completed index files and found {len(result.index)} entries in it')\n", | |
| "    return result.set_index(\"idx\")\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "def ecmwf_idx_unique_dict(edf):\n", | |
| "    \"\"\"\n", | |
| "    Build one selector entry per (param, levtype) from a parsed index frame.\n", | |
| "\n", | |
| "    Only surface ('sfc') rows and pressure-level ('pl') rows whose levelist is\n", | |
| "    '50' are considered. ``edf`` is modified in place: missing 'levelist'\n", | |
| "    values are replaced by the string 'null'.\n", | |
| "\n", | |
| "    :param edf: data frame with 'param', 'levtype', 'levelist' and\n", | |
| "        'ens_number' columns\n", | |
| "    :return: dict keyed by '<param>_<levtype>' whose values hold 'param',\n", | |
| "        'levtype', 'ens_number' and 'levelist'\n", | |
| "    \"\"\"\n", | |
| "    def _pick_ens_number(numbers):\n", | |
| "        # -1 wins whenever it occurs in the group; otherwise keep the first value\n", | |
| "        if -1 in numbers.values:\n", | |
| "            return -1\n", | |
| "        return numbers.iloc[0]\n", | |
| "\n", | |
| "    # Replace missing 'levelist' values with the string 'null' (in place on edf)\n", | |
| "    edf['levelist'] = edf['levelist'].fillna('null')\n", | |
| "\n", | |
| "    is_surface = edf['levtype'] == 'sfc'\n", | |
| "    is_pl_level_50 = (edf['levtype'] == 'pl') & (edf['levelist'] == '50')\n", | |
| "    selected = edf[is_pl_level_50 | is_surface]\n", | |
| "\n", | |
| "    grouped = (\n", | |
| "        selected\n", | |
| "        .groupby(['param', 'levtype', 'levelist'])\n", | |
| "        .agg({'ens_number': _pick_ens_number})\n", | |
| "        .reset_index()\n", | |
| "    )\n", | |
| "\n", | |
| "    unique_params = {}\n", | |
| "    for param, levtype, ens_number in zip(\n", | |
| "        grouped['param'], grouped['levtype'], grouped['ens_number']\n", | |
| "    ):\n", | |
| "        unique_params[f\"{param}_{levtype}\"] = {\n", | |
| "            'param': param,\n", | |
| "            'levtype': levtype,\n", | |
| "            'ens_number': ens_number,\n", | |
| "            'levelist': 'null' if levtype == 'sfc' else '50',\n", | |
| "        }\n", | |
| "    return unique_params\n", | |
| "\n", | |
| "\n", | |
| "def ecmwf_duplicate_dict_ens_mem(var_dict):\n", | |
| "    \"\"\"\n", | |
| "    Expand each variable entry into one entry per ensemble number.\n", | |
| "\n", | |
| "    The returned dict keeps the original entries and adds, for every ensemble\n", | |
| "    number in -1, 1, 2, ..., 50, a copy of each entry stored under\n", | |
| "    '<key>_ens<number>' with 'ens_number' set to that number. ``var_dict``\n", | |
| "    itself is not modified.\n", | |
| "\n", | |
| "    :param var_dict: dict mapping a key to a dict of selection fields\n", | |
| "    :return: new dict holding the original and the per-ensemble entries\n", | |
| "    \"\"\"\n", | |
| "    # Control (-1) first, followed by ensemble members 1-50\n", | |
| "    member_numbers = [-1, *range(1, 51)]\n", | |
| "    expanded = var_dict.copy()\n", | |
| "    for member in member_numbers:\n", | |
| "        expanded.update({\n", | |
| "            f\"{name}_ens{member}\": {**fields, 'ens_number': member}\n", | |
| "            for name, fields in var_dict.items()\n", | |
| "        })\n", | |
| "    return expanded\n", | |
| "\n", | |
| "\n", | |
def ecmwf_get_matching_indices(filter_dict, df):
    """Return the index labels of the rows of ``df`` matched by any filter.

    Each value of ``filter_dict`` is a ``{column: value}`` dict; a row matches
    a filter when every listed column equals the given value (the string
    ``'null'`` is compared like any other value). A filter without conditions
    matches every row. Rows matched by several filters appear once per filter.

    Parameters
    ----------
    filter_dict : dict
        Maps an arbitrary key to a ``{column: value}`` condition dict. The keys
        are not used.
    df : pandas.DataFrame
        Must contain every column named in the conditions, plus ``param`` and
        ``levtype``, which are used to order the result.

    Returns
    -------
    list
        Index labels of the matching rows, sorted by ``param`` then
        ``levtype``. An empty list when nothing matches (previously this
        raised ``ValueError`` from ``pd.concat`` on an empty list).
    """
    matches = []
    for conditions in filter_dict.values():
        # Start from an all-True Series so that an empty condition dict selects
        # every row instead of producing the invalid lookup ``df[True]``.
        mask = pd.Series(True, index=df.index)
        for col, value in conditions.items():
            mask &= (df[col] == value)
        subset = df[mask]
        if not subset.empty:
            matches.append(subset)

    if not matches:
        return []
    return pd.concat(matches).sort_values(['param', 'levtype']).index.tolist()
| "\n", | |
| "\n", | |
def ecmwf_idx_df_create_with_keys(ecmwf_s3url):
    """Map index-file rows of one ECMWF GRIB file to per-member variable keys.

    The file's ``index`` side-car is parsed with ``s3_parse_ecmwf_grib_idx``,
    the ``attr`` column is expanded into one column per attribute, and the
    resulting table is matched against every entry produced by
    ``ecmwf_idx_unique_dict`` + ``ecmwf_duplicate_dict_ens_mem``.

    Parameters
    ----------
    ecmwf_s3url : str
        Location of the GRIB file, passed as ``basename`` to the index parser.

    Returns
    -------
    tuple
        ``(idx_mapping, combined_dict)`` where ``idx_mapping`` maps an index
        label of the parsed table to the key of the last entry that matched
        it, and ``combined_dict`` is the output of ``ecmwf_idx_unique_dict``.
    """
    s3 = fsspec.filesystem("s3")
    parsed_index = s3_parse_ecmwf_grib_idx(fs=s3, basename=ecmwf_s3url, suffix='index')

    # NOTE(review): assumes each 'attr' cell is dict-like so that
    # apply(pd.Series) yields one column per attribute - confirm against the parser.
    attr_columns = parsed_index['attr'].apply(pd.Series)
    edf = pd.concat([parsed_index.drop('attr', axis=1), attr_columns], axis=1)

    # ecmwf_idx_unique_dict also rewrites edf['levelist'] in place (NaN -> 'null'),
    # which is what lets the 'null' conditions below match.
    combined_dict = ecmwf_idx_unique_dict(edf)
    all_em = ecmwf_duplicate_dict_ens_mem(combined_dict)

    idx_mapping = {}
    for ens_key, conditions in all_em.items():
        mask = True
        for col, value in conditions.items():
            # 'null' is an ordinary string here, so one comparison covers both cases.
            mask = mask & (edf[col] == value)
        # Later entries overwrite earlier ones for the same row.
        idx_mapping.update(dict.fromkeys(edf[mask].index.tolist(), ens_key))
    return idx_mapping, combined_dict
| "\n", | |
| "\n", | |
def _ecmwf_inline_int64(value):
    """Encode ``value`` as an inline reference string holding one little-endian int64.

    ASCII-only payloads are stored verbatim; anything else (for example -1,
    whose bytes are all 0xFF) is stored with a ``base64:`` prefix, because a
    plain string reference is only safe for ASCII data.
    """
    import base64

    raw = np.array(value, dtype="<i8").tobytes()
    try:
        return raw.decode("ascii")
    except UnicodeDecodeError:
        return "base64:" + base64.b64encode(raw).decode("ascii")


def _ecmwf_tag_group(group, group_index, ens_key, ens_number):
    """Return a deep copy of ``group`` whose references carry the ensemble labels."""
    coord_names = ['latitude', 'longitude', 'number', 'time', 'step', 'valid_time']
    mod_group = copy.deepcopy(group)
    refs = mod_group['refs']

    # Pass 1: find the data variables (arrays with dimensions that are not one
    # of the known coordinates). Collected first because refs is edited below.
    data_vars = []
    for key in refs:
        if not key.endswith('/.zattrs'):
            continue
        var_name = key.split('/')[0]
        if var_name.startswith('.'):
            continue
        try:
            attrs = json.loads(refs[key])
        except json.JSONDecodeError:
            print(f"Error decoding {key}")
            continue
        if '_ARRAY_DIMENSIONS' in attrs and len(attrs['_ARRAY_DIMENSIONS']) > 0:
            if var_name not in coord_names:
                data_vars.append(var_name)

    # Pass 2: label the root attributes and list 'number' among the coordinates.
    if '.zattrs' in refs:
        try:
            root_attrs = json.loads(refs['.zattrs'])
            root_attrs['ensemble_member'] = ens_number
            root_attrs['ensemble_key'] = ens_key
            if 'coordinates' in root_attrs:
                coords = root_attrs['coordinates'].split()
                if 'number' not in coords:
                    coords.append('number')
                    root_attrs['coordinates'] = ' '.join(coords)
            refs['.zattrs'] = json.dumps(root_attrs)
        except json.JSONDecodeError:
            print(f"Error updating root attributes for group {group_index}")

    # Pass 3: label each data variable.
    for var_name in data_vars:
        attr_key = f"{var_name}/.zattrs"
        if attr_key not in refs:
            continue
        try:
            var_attrs = json.loads(refs[attr_key])
            var_attrs['ensemble_member'] = ens_number
            var_attrs['ensemble_key'] = ens_key
            refs[attr_key] = json.dumps(var_attrs)
        except json.JSONDecodeError:
            print(f"Error updating attributes for {var_name}")

    # Pass 4: synthesize a scalar 'number' coordinate when the message has none.
    has_number = any(
        key == 'number/.zattrs' or key.endswith('/number/.zattrs') for key in refs
    )
    if not has_number:
        print(f"Adding number coordinate for group {group_index}, ensemble {ens_number}")
        refs['number/.zarray'] = json.dumps({
            "chunks": [],
            "compressor": None,
            "dtype": "<i8",
            "fill_value": None,
            "filters": None,
            "order": "C",
            "shape": [],
            "zarr_format": 2
        })
        refs['number/.zattrs'] = json.dumps({
            "_ARRAY_DIMENSIONS": [],
            "long_name": "ensemble member numerical id",
            "standard_name": "realization",
            "units": "1"
        })
        refs['number/0'] = _ecmwf_inline_int64(ens_number)
    return mod_group


def ecmwf_filter_scan_grib(ecmwf_s3url):
    """
    Scan an ECMWF GRIB file, add ensemble information to the Zarr references,
    and return a list of modified groups along with an index mapping.

    Only messages whose position in the ``scan_grib`` output is a key of the
    index mapping are kept. Each kept message is deep-copied and annotated with
    ``ensemble_member`` / ``ensemble_key`` attributes (root and data variables);
    a scalar ``number`` coordinate is added when the message has none.

    Parameters
    ----------
    ecmwf_s3url : str
        Location of the GRIB file; passed to ``scan_grib`` and to
        ``ecmwf_idx_df_create_with_keys``.

    Returns
    -------
    tuple
        ``(modified_groups, idx_mapping)``.
    """
    esc_groups = scan_grib(ecmwf_s3url)
    print(f"Completed scan_grib for {ecmwf_s3url}, found {len(esc_groups)} messages")
    idx_mapping, _ = ecmwf_idx_df_create_with_keys(ecmwf_s3url)
    print(f"Found {len(idx_mapping)} matching indices")

    modified_groups = []
    # NOTE(review): assumes the index-file labels line up with scan_grib's
    # message order counted from 0 - confirm against the index parser.
    for i, group in enumerate(esc_groups):
        if i not in idx_mapping:
            continue
        ens_key = idx_mapping[i]
        # Keys look like "<param>_<levtype>_ens<N>"; keys without the marker
        # fall back to -1.
        ens_number = int(ens_key.split('ens')[-1]) if 'ens' in ens_key else -1
        modified_groups.append(_ecmwf_tag_group(group, i, ens_key, ens_number))
    return modified_groups, idx_mapping
| "\n", | |
| "\n", | |
def organize_ensemble_tree(original_tree):
    """
    Reorganize the original Zarr tree by adding ensemble dimensions to the attributes.

    Works on a deep copy of ``original_tree``; only metadata entries
    (``.zattrs`` / ``.zarray``) are rewritten, plus a root-level ``number``
    coordinate when none exists. Chunk keys are not renamed.

    Steps:

    1. Every ``.zattrs`` entry with ``_ARRAY_DIMENSIONS`` lacking ``number``
       gets ``number`` inserted right after ``step`` when both ``time`` and
       ``step`` are present, otherwise at position 0. The root ``.zattrs``
       additionally gets ``number`` appended to its ``coordinates`` string.
    2. If ``number/.zattrs`` is absent, a 51-element ``number`` coordinate
       (``-1`` followed by ``1..50``) is added as inline data.
    3. For every ``.zarray`` whose attributes now list ``number`` (except
       those whose first path component is a known coordinate name), 51 is
       inserted into ``shape`` and ``chunks`` at the position of ``number``.

    Parameters
    ----------
    original_tree : dict
        Reference dict with a ``refs`` mapping of key -> JSON string.

    Returns
    -------
    dict
        The modified copy.
    """
    import base64

    # Member ids follow the numbering used for the ensemble keys elsewhere in
    # this file: -1 first, then 1..50 (there is no member 0).
    member_ids = np.array([-1, *range(1, 51)], dtype="<i8")
    n_members = len(member_ids)

    ensemble_tree = copy.deepcopy(original_tree)
    refs = ensemble_tree['refs']

    # The root '.zattrs' has no leading slash, so it must be selected explicitly
    # or the coordinates update below could never run.
    attrs_keys = [k for k in refs if k == '.zattrs' or k.endswith('/.zattrs')]
    for key in attrs_keys:
        try:
            attrs = json.loads(refs[key])
        except json.JSONDecodeError:
            print(f"Error parsing attributes for {key}")
            continue

        if '_ARRAY_DIMENSIONS' in attrs and 'number' not in attrs['_ARRAY_DIMENSIONS']:
            dimensions = attrs['_ARRAY_DIMENSIONS']
            if 'time' in dimensions and 'step' in dimensions:
                dimensions.insert(dimensions.index('step') + 1, 'number')
            else:
                dimensions.insert(0, 'number')
            refs[key] = json.dumps(attrs)

        if key == '.zattrs' and 'coordinates' in attrs:
            coords = attrs['coordinates'].split()
            if 'number' not in coords:
                coords.append('number')
                attrs['coordinates'] = ' '.join(coords)
                refs[key] = json.dumps(attrs)

    if 'number/.zattrs' not in refs:
        refs['number/.zattrs'] = json.dumps({
            # A 1-D array of length n_members must name its single dimension.
            "_ARRAY_DIMENSIONS": ["number"],
            "long_name": "ensemble member numerical id",
            "standard_name": "realization",
            "units": "1"
        })
        refs['number/.zarray'] = json.dumps({
            "chunks": [n_members],
            "compressor": None,
            "dtype": "<i8",
            "fill_value": None,
            "filters": None,
            "order": "C",
            "shape": [n_members],
            "zarr_format": 2
        })
        # Inline payloads are plain strings only when pure ASCII; the bytes of
        # -1 are 0xFF, so the payload is stored with the 'base64:' prefix.
        raw = member_ids.tobytes()
        try:
            refs['number/0'] = raw.decode('ascii')
        except UnicodeDecodeError:
            refs['number/0'] = 'base64:' + base64.b64encode(raw).decode('ascii')

    def _insert_member_axis(sizes, position):
        # insert() at len(sizes) appends; positions past the end are left alone.
        if position <= len(sizes):
            sizes.insert(position, n_members)

    skipped_names = ['latitude', 'longitude', 'time', 'step', 'valid_time', 'number']
    array_keys = [k for k in refs if k.endswith('/.zarray')]
    for key in array_keys:
        try:
            # The check uses the first path component of the key.
            if key.split('/')[0] in skipped_names:
                continue
            attr_key = key.replace('/.zarray', '/.zattrs')
            if attr_key not in refs:
                continue
            attrs = json.loads(refs[attr_key])
            if '_ARRAY_DIMENSIONS' not in attrs or 'number' not in attrs['_ARRAY_DIMENSIONS']:
                continue
            array_def = json.loads(refs[key])
            number_index = attrs['_ARRAY_DIMENSIONS'].index('number')
            _insert_member_axis(array_def['shape'], number_index)
            # Also extend empty chunk lists (scalar arrays) so that chunks and
            # shape keep the same length.
            if isinstance(array_def.get('chunks'), list):
                _insert_member_axis(array_def['chunks'], number_index)
            refs[key] = json.dumps(array_def)
        except json.JSONDecodeError:
            print(f"Error parsing array definition for {key}")
    return ensemble_tree
| "\n", | |
def _ensemble_member_from_refs(refs):
    """Look up the ensemble member recorded in one message's references.

    The ``ensemble_member`` key of the root ``.zattrs`` is consulted first. If
    it yields nothing, the first ``number/0`` (or ``*/number/0``) string entry
    that decodes to exactly one int64 is used; both ``base64:``-prefixed and
    latin1-encoded payloads are understood. Returns ``None`` when neither
    source provides a value.
    """
    import base64

    member = None
    if ".zattrs" in refs:
        member = json.loads(refs[".zattrs"]).get("ensemble_member")
    if member is not None:
        return member

    for key, val in refs.items():
        if not (key == "number/0" or key.endswith("/number/0")):
            continue
        if not isinstance(val, str):
            continue
        try:
            if val.startswith("base64:"):
                raw = base64.b64decode(val[len("base64:"):])
            else:
                raw = val.encode("latin1")
            arr = np.frombuffer(raw, dtype=np.int64)
        except ValueError:
            # Undecodable text or a byte count that is not a whole number of
            # int64 values: not a usable member id, try the next candidate.
            continue
        if len(arr) == 1:
            return int(arr[0])
    return None


def fixed_ensemble_grib_tree(
    message_groups: Iterable[Dict],
    remote_options=None,
    debug_output=False
) -> Dict:
    """
    Build a hierarchical data model from a set of scanned grib messages with proper ensemble support
    and correct zarr path structure.

    Messages are grouped under ``<variable>/<stepType>/<typeOfLevel>`` (filter
    levels without a usable value are left out of the path). Each group is
    combined with ``MultiZarrToZarr`` and its references are stored under that
    path, together with ``.zgroup`` / ``.zattrs`` entries for every level of
    the hierarchy.

    Parameters
    ----------
    message_groups: iterable[dict]
        a collection of zarr store like dictionaries as produced by scan_grib.
        Any iterable works, including generators; it is traversed once.
    remote_options: dict
        remote options to pass to MultiZarrToZarr
    debug_output: bool
        If True, prints detailed debugging information

    Returns
    -------
    dict: A zarr store like dictionary with proper ensemble support

    Notes
    -----
    Messages that cannot be interpreted (wrong ``version``, missing root
    ``coordinates`` attribute, no data variable, unreadable variable
    attributes) are skipped. A path whose aggregation fails is left out of the
    result; the failure is printed with a traceback when ``debug_output`` is
    set and reported through ``warnings.warn`` otherwise.
    """
    import traceback
    import warnings

    # Hard code the filters in the correct order for the group hierarchy
    filters = ["stepType", "typeOfLevel"]

    # Use a regular dictionary for storage
    zarr_store = {'.zgroup': json.dumps({'zarr_format': 2})}

    # Track information by path
    aggregations = defaultdict(list)
    ensemble_dimensions = defaultdict(set)
    level_dimensions = defaultdict(set)
    # Filter names behind each path component after the variable name, so the
    # group attributes are labelled correctly even when a filter is missing.
    path_filter_names = {}
    # Counted by hand because message_groups may be a generator without len().
    message_count = 0

    # Process each message group and determine paths
    for msg_ind, group in enumerate(message_groups):
        message_count += 1
        if "version" not in group or group["version"] != 1:
            if debug_output:
                print(f"Skipping message {msg_ind}: Invalid version")
            continue

        # Extract ensemble member information
        ensemble_member = None
        try:
            ensemble_member = _ensemble_member_from_refs(group["refs"])
        except Exception as e:
            if debug_output:
                print(f"Warning: Error extracting ensemble information for msg {msg_ind}: {e}")

        # Try to extract coordinates from the root attributes
        try:
            gattrs = json.loads(group["refs"][".zattrs"])
            coordinates = gattrs["coordinates"].split(" ")
        except Exception as e:
            if debug_output:
                print(f"Warning: Issue with attributes for message {msg_ind}: {e}")
            continue

        # Find the data variable: the first key that is neither root metadata
        # nor one of the listed coordinates.
        vname = None
        for key in group["refs"]:
            name = key.split("/")[0]
            if name not in [".zattrs", ".zgroup"] and name not in coordinates:
                vname = name
                break

        if vname is None or vname == "unknown":
            if debug_output:
                print(f"Warning: No valid data variable found for message {msg_ind}")
            continue

        # Extract attributes for this variable
        try:
            dattrs = json.loads(group["refs"][f"{vname}/.zattrs"])
        except Exception as e:
            if debug_output:
                print(f"Warning: Issue with variable attributes for {vname} in message {msg_ind}: {e}")
            continue

        # Build path based on filter attributes (empty and "unknown" values are dropped)
        gfilters = {}
        for key in filters:
            attr_val = dattrs.get(f"GRIB_{key}")
            if attr_val and attr_val != "unknown":
                gfilters[key] = attr_val

        # The base path excludes ensemble information
        base_path = "/".join([vname, *gfilters.values()])
        path_filter_names[base_path] = list(gfilters)

        # Add group to aggregations; the copy keeps the caller's dict untouched
        group_copy = copy.deepcopy(group)
        if ensemble_member is not None:
            group_copy["ensemble_member"] = ensemble_member
            ensemble_dimensions[base_path].add(ensemble_member)
        aggregations[base_path].append(group_copy)

        # Track level information
        for key, entry in group["refs"].items():
            name = key.split("/")[0]
            if name == gfilters.get("typeOfLevel") and key.endswith("0"):
                if isinstance(entry, list):
                    entry = tuple(entry)  # lists are not hashable
                level_dimensions[base_path].add(entry)

    # Print diagnostics for paths if debug is enabled
    if debug_output:
        print(f"Found {len(aggregations)} unique paths from {message_count} messages")
        for path, groups in sorted(aggregations.items(), key=lambda x: len(x[1]), reverse=True):
            ensemble_count = len(ensemble_dimensions.get(path, set()))
            level_count = len(level_dimensions.get(path, set()))
            print(f"  {path}: {len(groups)} groups, {ensemble_count} ensemble members, {level_count} levels")

    # Process each path with MultiZarrToZarr and ensure proper hierarchical structure
    for path, groups in aggregations.items():
        path_parts = path.split("/")
        filter_names = path_filter_names.get(path, [])

        # Build groups for each level in the hierarchy
        for i in range(len(path_parts)):
            current_path = "/".join(path_parts[:i + 1])

            # Add .zgroup for this level if not already present
            if f"{current_path}/.zgroup" not in zarr_store:
                zarr_store[f"{current_path}/.zgroup"] = json.dumps({'zarr_format': 2})

            # Add .zattrs for this level
            if f"{current_path}/.zattrs" not in zarr_store:
                attrs = {}
                # Component i (i >= 1) holds the value of the (i-1)-th filter
                # that was actually present for this path.
                if 1 <= i <= len(filter_names):
                    attrs[filter_names[i - 1]] = path_parts[i]
                zarr_store[f"{current_path}/.zattrs"] = json.dumps(attrs)

        # Get dimensions for this path
        catdims = ["time", "step"]  # Always concatenate time and step
        idims = ["longitude", "latitude"]  # Latitude and longitude are always identical

        # Handle level dimensions
        level_count = len(level_dimensions.get(path, set()))
        level_name = path_parts[-1] if len(path_parts) > 0 else None

        if level_count == 1:
            # Single level - treat as identical dimension
            if level_name and level_name not in idims:
                idims.append(level_name)
        elif level_count > 1:
            # Multiple levels - treat as concat dimension
            if level_name and level_name not in catdims:
                catdims.append(level_name)

        # Handle ensemble dimension
        ensemble_count = len(ensemble_dimensions.get(path, set()))
        if ensemble_count > 1 and "number" not in catdims:
            catdims.append("number")
            # Sort groups by ensemble number for consistent processing
            groups.sort(key=lambda g: g.get("ensemble_member", 0))

        if debug_output:
            print(f"Processing {path} with concat_dims={catdims}, identical_dims={idims}")

        try:
            # Create aggregation
            mzz = MultiZarrToZarr(
                groups,
                remote_options=remote_options,
                concat_dims=catdims,
                identical_dims=idims,
            )

            # Get result and store references
            group_result = mzz.translate()

            # Add each reference with proper path prefix
            for key, value in group_result["refs"].items():
                if key == ".zattrs" or key == ".zgroup":
                    # Don't overwrite existing group metadata
                    if f"{path}/{key}" not in zarr_store:
                        zarr_store[f"{path}/{key}"] = value
                else:
                    # Data or other references
                    zarr_store[f"{path}/{key}"] = value

        except Exception as e:
            # Best effort: one failing path must not abort the others, but it
            # must not vanish without a trace either.
            if debug_output:
                print(f"Error processing path {path}: {e}")
                traceback.print_exc()
            else:
                warnings.warn(f"Error processing path {path}: {e}")

    # Convert all byte values to strings for compatibility
    zarr_store = {
        key: (val.decode('utf-8') if isinstance(val, bytes) else val)
        for key, val in zarr_store.items()
    }

    return {
        "refs": zarr_store,
        "version": 1
    }
| "\n", | |
| "\n", | |
def analyze_grib_tree_output(original_tree, ensembe_tree):
    """
    Analyze and compare outputs from different grib_tree functions

    Prints, for each tree, the number of references, the reference count per
    first path component, the count per path depth and the keys that mention
    ``number`` or ``ensemble``; then lists up to five paths of one variable
    present in both trees.

    Parameters
    ----------
    original_tree, ensembe_tree : dict
        Reference dicts with a ``refs`` mapping. (The second parameter keeps
        its historical spelling so keyword callers continue to work.)

    Returns
    -------
    dict
        ``{"original": stats, "ensemble": stats}`` where each ``stats`` has the
        keys ``total``, ``by_prefix``, ``by_level`` and ``ensemble_refs``.
    """
    # Count references by path prefix (first component of the key)
    def count_by_prefix(refs_dict):
        prefix_counts = {}
        for key in refs_dict:
            prefix = key.split('/')[0]
            prefix_counts[prefix] = prefix_counts.get(prefix, 0) + 1
        return prefix_counts

    # Count references by group level (number of '/' in the key)
    def count_by_level(refs_dict):
        level_counts = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for key in refs_dict:
            # Bucket 5 also collects everything deeper than level 5.
            level_counts[min(key.count('/'), 5)] += 1
        return level_counts

    # Look for ensemble-related entries
    def find_ensemble_refs(refs_dict):
        return [key for key in refs_dict if 'number' in key or 'ensemble' in key]

    # Print one tree's section of the report
    def report(title, refs_dict, prefix_counts, level_counts, ensemble_refs):
        print(title)
        print(f"Total references: {len(refs_dict)}")
        print("\nReferences by variable prefix:")
        for prefix, count in sorted(prefix_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {prefix}: {count}")

        print("\nReferences by path depth:")
        for level, count in level_counts.items():
            print(f"  Level {level}: {count}")

        print(f"\nEnsemble-related references: {len(ensemble_refs)}")
        if ensemble_refs:
            print("  Examples:")
            for ref in ensemble_refs[:5]:  # Show up to 5 examples
                print(f"    {ref}")

    orig_refs = original_tree['refs']
    ens_refs = ensembe_tree['refs']

    # Analyze original tree
    orig_prefix_counts = count_by_prefix(orig_refs)
    orig_level_counts = count_by_level(orig_refs)
    orig_ensemble_refs = find_ensemble_refs(orig_refs)

    # Analyze ensemble tree
    ens_prefix_counts = count_by_prefix(ens_refs)
    ens_level_counts = count_by_level(ens_refs)
    ens_ensemble_refs = find_ensemble_refs(ens_refs)

    # Print analysis
    report("=== ORIGINAL TREE ANALYSIS ===", orig_refs,
           orig_prefix_counts, orig_level_counts, orig_ensemble_refs)
    report("\n=== ENSEMBLE TREE ANALYSIS ===", ens_refs,
           ens_prefix_counts, ens_level_counts, ens_ensemble_refs)

    # Compare structure of a sample variable
    print("\n=== SAMPLE VARIABLE COMPARISON ===")
    # Root metadata keys such as '.zgroup' are not variables, and set iteration
    # order changes between runs, so take the alphabetically first real prefix.
    common_prefixes = sorted(
        prefix
        for prefix in set(orig_prefix_counts) & set(ens_prefix_counts)
        if not prefix.startswith('.')
    )
    if common_prefixes:
        sample_var = common_prefixes[0]
        print(f"Sample variable: {sample_var}")

        # Get all paths for this variable
        orig_var_paths = [p for p in orig_refs if p.startswith(f"{sample_var}/")]
        ens_var_paths = [p for p in ens_refs if p.startswith(f"{sample_var}/")]

        print(f"Original tree paths: {len(orig_var_paths)}")
        for path in sorted(orig_var_paths)[:5]:  # Show up to 5 examples
            print(f"  {path}")

        print(f"Ensemble tree paths: {len(ens_var_paths)}")
        for path in sorted(ens_var_paths)[:5]:  # Show up to 5 examples
            print(f"  {path}")

    return {
        "original": {
            "total": len(orig_refs),
            "by_prefix": orig_prefix_counts,
            "by_level": orig_level_counts,
            "ensemble_refs": len(orig_ensemble_refs)
        },
        "ensemble": {
            "total": len(ens_refs),
            "by_prefix": ens_prefix_counts,
            "by_level": ens_level_counts,
            "ensemble_refs": len(ens_ensemble_refs)
        }
    }
| "\n", | |
| "\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "7af237f9-5c0e-4483-b0a3-ad947b60aa7c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Completed scan_grib for s3://ecmwf-forecasts/20240229/00z/ifs/0p25/enfo/20240229000000-0h-enfo-ef.grib2, found 4233 messages\n", | |
| "Completed index files and found 4233 entries in it\n", | |
| "Found 969 matching indices\n", | |
| "Completed scan_grib for s3://ecmwf-forecasts/20240229/00z/ifs/0p25/enfo/20240229000000-3h-enfo-ef.grib2, found 4233 messages\n", | |
| "Completed index files and found 4233 entries in it\n", | |
| "Found 969 matching indices\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/opt/conda/lib/python3.12/site-packages/kerchunk/combine.py:370: UserWarning: Concatenated coordinate 'time' contains less than expectednumber of values across the datasets: [1709164800]\n", | |
| " warnings.warn(\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Built original tree with 622 references\n", | |
| "Created ensemble tree with 625 references\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# ===== Final execution starts here =====
# Scans the 0h and 3h ensemble files, keeps the messages selected by
# ecmwf_filter_scan_grib, then builds kerchunk's grib_tree and the
# ensemble-annotated variant from the combined groups.
import traceback

date_str = '20240229'
ecmwf_files = [
    f"s3://ecmwf-forecasts/{date_str}/00z/ifs/0p25/enfo/{date_str}000000-0h-enfo-ef.grib2",
    f"s3://ecmwf-forecasts/{date_str}/00z/ifs/0p25/enfo/{date_str}000000-3h-enfo-ef.grib2"
]

# Best effort per file: a failing file is reported and skipped, so the trees
# below may be built from fewer forecast steps than listed above.
all_groups = []
for eurl in ecmwf_files:
    try:
        groups, idx_mapping = ecmwf_filter_scan_grib(eurl)
    except Exception as exc:
        print(f"Error processing {eurl}: {exc}")
        traceback.print_exc()
    else:
        all_groups.extend(groups)

if len(all_groups) == 0:
    raise ValueError("No valid groups were found")

try:
    # Build the original tree using kerchunk's grib_tree
    original_tree = grib_tree(all_groups)
    print(f"Built original tree with {len(original_tree['refs'])} references")
    modified_tree = organize_ensemble_tree(original_tree)
    print(f"Created ensemble tree with {len(modified_tree['refs'])} references")
except Exception as exc:
    # Report, then re-raise so that the cell still fails visibly.
    print(f"Error building trees: {exc}")
    traceback.print_exc()
    raise
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "a9fcd49b-b792-4af6-b474-6c368398b3e5", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Found 19 unique paths from 1938 messages\n", | |
| " t2m/instant/heightAboveGround: 102 groups, 51 ensemble members, 1 levels\n", | |
| " u10/instant/heightAboveGround: 102 groups, 51 ensemble members, 1 levels\n", | |
| " v10/instant/heightAboveGround: 102 groups, 51 ensemble members, 1 levels\n", | |
| " tp/accum/surface: 102 groups, 51 ensemble members, 1 levels\n", | |
| " msl/instant/meanSea: 102 groups, 51 ensemble members, 1 levels\n", | |
| " lsm/instant/surface: 102 groups, 51 ensemble members, 1 levels\n", | |
| " sp/instant/surface: 102 groups, 51 ensemble members, 1 levels\n", | |
| " st/instant/depthBelowLandLayer: 102 groups, 51 ensemble members, 0 levels\n", | |
| " v/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " u/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " skt/instant/surface: 102 groups, 51 ensemble members, 1 levels\n", | |
| " gh/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " t/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " q/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " r/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " tcwv/instant/entireAtmosphere: 102 groups, 51 ensemble members, 0 levels\n", | |
| " d/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| " ro/accum/surface: 102 groups, 51 ensemble members, 1 levels\n", | |
| " vo/instant/isobaricInhPa: 102 groups, 51 ensemble members, 1 levels\n", | |
| "Processing t2m/instant/heightAboveGround with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'heightAboveGround']\n", | |
| "Processing u10/instant/heightAboveGround with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'heightAboveGround']\n", | |
| "Processing v10/instant/heightAboveGround with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'heightAboveGround']\n", | |
| "Processing tp/accum/surface with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'surface']\n", | |
| "Processing msl/instant/meanSea with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'meanSea']\n", | |
| "Processing lsm/instant/surface with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'surface']\n", | |
| "Processing sp/instant/surface with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'surface']\n", | |
| "Processing st/instant/depthBelowLandLayer with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude']\n", | |
| "Processing v/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing u/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing skt/instant/surface with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'surface']\n", | |
| "Processing gh/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing t/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing q/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing r/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing tcwv/instant/entireAtmosphere with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude']\n", | |
| "Processing d/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Processing ro/accum/surface with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'surface']\n", | |
| "Processing vo/instant/isobaricInhPa with concat_dims=['time', 'step', 'number'], identical_dims=['longitude', 'latitude', 'isobaricInhPa']\n", | |
| "Total refs: 4403\n", | |
| "['.zgroup', 't2m/.zgroup', 't2m/.zattrs', 'u10/.zgroup', 'u10/.zattrs', 'v10/.zgroup', 'v10/.zattrs', 'tp/.zgroup', 'tp/.zattrs', 'msl/.zgroup']\n", | |
| "KeysView(DataTree('None', parent=None)\n", | |
| "├── DataTree('d')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (time: 1, step: 2, number: 51, latitude: 721, longitude: 1440)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ d (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('gh')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (time: 1, step: 2, number: 51, latitude: 721, longitude: 1440)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ gh (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('lsm')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 2, number: 51)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ lsm (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ surface float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('msl')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('meanSea')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, time: 1, step: 2, number: 51)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ meanSea float64 8B ...\n", | |
| "│ msl (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: meanSea\n", | |
| "├── DataTree('q')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ q (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('r')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ r (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('ro')\n", | |
| "│ └── DataTree('accum')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: accum\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ ro (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ surface float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('skt')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ skt (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ surface float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('sp')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ sp (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ surface float64 8B ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('st')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('depthBelowLandLayer')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, time: 1, step: 2)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ st (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: depthBelowLandLayer\n", | |
| "├── DataTree('t')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ t (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('t2m')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2,\n", | |
| "│ time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 ... 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ t2m (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "├── DataTree('tcwv')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('entireAtmosphere')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ tcwv (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: entireAtmosphere\n", | |
| "├── DataTree('tp')\n", | |
| "│ └── DataTree('accum')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: accum\n", | |
| "│ └── DataTree('surface')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 43 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ surface float64 8B ...\n", | |
| "│ tp (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: surface\n", | |
| "├── DataTree('u')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ u (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('u10')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2,\n", | |
| "│ time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 ... 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ u10 (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "├── DataTree('v')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('isobaricInhPa')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ isobaricInhPa float64 8B ...\n", | |
| "│ v (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: isobaricInhPa\n", | |
| "├── DataTree('v10')\n", | |
| "│ └── DataTree('instant')\n", | |
| "│ │ Dimensions: ()\n", | |
| "│ │ Data variables:\n", | |
| "│ │ *empty*\n", | |
| "│ │ Attributes:\n", | |
| "│ │ stepType: instant\n", | |
| "│ └── DataTree('heightAboveGround')\n", | |
| "│ Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2,\n", | |
| "│ time: 1)\n", | |
| "│ Coordinates:\n", | |
| "│ * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.75 -90.0\n", | |
| "│ * longitude (longitude) float64 12kB -180.0 -179.8 ... 179.5 179.8\n", | |
| "│ * number (number) int64 408B 0 1 2 3 4 5 6 ... 45 46 47 48 49 50\n", | |
| "│ * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| "│ * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| "│ Data variables:\n", | |
| "│ heightAboveGround float64 8B ...\n", | |
| "│ v10 (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| "│ valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| "│ Attributes:\n", | |
| "│ typeOfLevel: heightAboveGround\n", | |
| "└── DataTree('vo')\n", | |
| " └── DataTree('instant')\n", | |
| " │ Dimensions: ()\n", | |
| " │ Data variables:\n", | |
| " │ *empty*\n", | |
| " │ Attributes:\n", | |
| " │ stepType: instant\n", | |
| " └── DataTree('isobaricInhPa')\n", | |
| " Dimensions: (latitude: 721, longitude: 1440, number: 51, step: 2, time: 1)\n", | |
| " Coordinates:\n", | |
| " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", | |
| " * longitude (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8\n", | |
| " * number (number) int64 408B 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", | |
| " * step (step) timedelta64[ns] 16B 00:00:00 03:00:00\n", | |
| " * time (time) datetime64[ns] 8B 2024-02-29\n", | |
| " Data variables:\n", | |
| " isobaricInhPa float64 8B ...\n", | |
| " valid_time (time, step, number) datetime64[ns] 816B ...\n", | |
| " vo (time, step, number, latitude, longitude) float64 847MB ...\n", | |
| " Attributes:\n", | |
| " typeOfLevel: isobaricInhPa)\n", | |
| "Frozen({})\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ensemble_tree = fixed_ensemble_grib_tree(all_groups, debug_output=True)\n", | |
| "\n", | |
| "# Check the references directly\n", | |
| "print(f\"Total refs: {len(ensemble_tree['refs'])}\")\n", | |
| "# Look at structure - should have proper hierarchy\n", | |
| "print([key for key in ensemble_tree['refs'].keys() if key.count('/') <= 1][:10])\n", | |
| "\n", | |
| "# Open with datatree\n", | |
| "egfs_dt = datatree.open_datatree(\n", | |
| " fsspec.filesystem(\"reference\", fo=ensemble_tree).get_mapper(\"\"), \n", | |
| " engine=\"zarr\", \n", | |
| " consolidated=False\n", | |
| ")\n", | |
| "\n", | |
| "# Check for variables\n", | |
| "print(egfs_dt.keys())\n", | |
| "\n", | |
| "# Try accessing a variable (use an actual variable name from your data)\n", | |
| "var_node = egfs_dt['t2m'] # Or another variable name from your data\n", | |
| "print(var_node.dims) # Should show the 'number' dimension among others" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "db17cc34-b674-4341-94f8-150efaae9bc6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.7" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment