Skip to content

Instantly share code, notes, and snippets.

@phinate
Created July 3, 2024 14:53
Show Gist options
  • Save phinate/4b6335bea9cd276a92c956a3c645a33c to your computer and use it in GitHub Desktop.
Save phinate/4b6335bea9cd276a92c956a3c645a33c to your computer and use it in GitHub Desktop.
EUMETSAT data scraping
# %%
import pyproj
import pyresample
import xarray
import numpy as np
def lon_lat_to_geostationary_area_coords(
x: float,
y: float,
xr_data: xarray.Dataset,
) -> tuple[float, float]:
"""Loads geostationary area and change from lon-lat to geostationaery coords
Args:
x: Longitude east-west
y: Latitude north-south
xr_data: xarray object with geostationary area
Returns:
Geostationary coords: x, y
"""
# WGS84 is short for "World Geodetic System 1984", used in GPS. Uses
# latitude and longitude.
WGS84 = 4326
try:
area_definition_yaml = xr_data.attrs["area"]
except KeyError:
area_definition_yaml = xr_data.data.attrs["area"]
geostationary_area_definition = pyresample.area_config.load_area_from_string(
area_definition_yaml
)
geostationary_crs = geostationary_area_definition.crs
lonlat_to_geostationary = pyproj.Transformer.from_crs(
crs_from=WGS84,
crs_to=geostationary_crs,
always_xy=True,
).transform
return lonlat_to_geostationary(xx=x, yy=y)
# %%
import pandas as pd
import xarray
import ocf_blosc2 # this gives us access the blosc2 compression codec (?)
SATELLITE_ZARR_PATH = "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr"
ds = xarray.open_zarr(SATELLITE_ZARR_PATH, chunks=None).sortby("time")
# %%
# check if the time difference is constant since the date values looked a lil bit off
def print_time_differences(xr_dataset: xarray.Dataset) -> None:
time_diffs = np.diff(xr_dataset.time.values.astype('datetime64[m]'))
unique_time_diffs, counts = np.unique(time_diffs, return_counts=True)
if len(unique_time_diffs) == 1:
print(f"There is only one unique time difference in the dataset: {unique_time_diffs[0]}")
return
print(f"There are {len(unique_time_diffs)} unique time differences in the dataset; the most common is {unique_time_diffs[0]} with {100*counts[0]/len(time_diffs):.5g}% frequency")
print_time_differences(ds)
# %%
start_time = "2020-01-01"
end_time = "2020-02-01"
data_inner_steps = 3 # how many steps to take in the time dimension (multiple of 5 minutes)
ds_filtered_time = ds.sel(time=slice(start_time, end_time, data_inner_steps))
print_time_differences(ds_filtered_time)
# %%
lon_min = -16
lon_max = 10
lat_min = 45
lat_max = 70
# Convert lon-lat bounds to geostationary-coords
((x_min, x_max), (y_min, y_max)) = lon_lat_to_geostationary_area_coords(
[lon_min, lon_max],
[lat_min, lat_max],
ds_filtered_time.data,
)
# Define the spatial area to slice from
ds_filtered_space_time = ds_filtered_time.sel(
x_geostationary=slice(x_max, x_min), # x-axis is in decreasing order
y_geostationary=slice(y_min, y_max),
)
# %%
import memray
with memray.Tracker("save-data.bin"):
ds_filtered_space_time.to_zarr(f"{start_time}_{end_time}_EUMETSAT_SEVIRI_RSS_v4.zarr")
# %%
@dfulu
Copy link

dfulu commented Jul 3, 2024

@phinate

Instead of ds_filtered_time = ds.sel(time=slice(start_time, end_time, data_inner_steps)), I think the following might be better?

mask = np.mod(ds.time.dt.minute, data_inner_steps*5)==0
ds_filtered_time = ds.sel(time=mask)

It means we'd always have data on the the 00, 15, 30 and 45 minute mark rather than letting that vary. That might be cleaner?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment