Created
July 3, 2024 14:53
-
-
Save phinate/4b6335bea9cd276a92c956a3c645a33c to your computer and use it in GitHub Desktop.
EUMETSAT data scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import pyproj | |
import pyresample | |
import xarray | |
import numpy as np | |
def lon_lat_to_geostationary_area_coords( | |
x: float, | |
y: float, | |
xr_data: xarray.Dataset, | |
) -> tuple[float, float]: | |
"""Loads geostationary area and change from lon-lat to geostationaery coords | |
Args: | |
x: Longitude east-west | |
y: Latitude north-south | |
xr_data: xarray object with geostationary area | |
Returns: | |
Geostationary coords: x, y | |
""" | |
# WGS84 is short for "World Geodetic System 1984", used in GPS. Uses | |
# latitude and longitude. | |
WGS84 = 4326 | |
try: | |
area_definition_yaml = xr_data.attrs["area"] | |
except KeyError: | |
area_definition_yaml = xr_data.data.attrs["area"] | |
geostationary_area_definition = pyresample.area_config.load_area_from_string( | |
area_definition_yaml | |
) | |
geostationary_crs = geostationary_area_definition.crs | |
lonlat_to_geostationary = pyproj.Transformer.from_crs( | |
crs_from=WGS84, | |
crs_to=geostationary_crs, | |
always_xy=True, | |
).transform | |
return lonlat_to_geostationary(xx=x, yy=y) | |
# %% | |
import pandas as pd | |
import xarray | |
import ocf_blosc2 # this gives us access the blosc2 compression codec (?) | |
SATELLITE_ZARR_PATH = "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr" | |
ds = xarray.open_zarr(SATELLITE_ZARR_PATH, chunks=None).sortby("time") | |
# %% | |
# check if the time difference is constant since the date values looked a lil bit off | |
def print_time_differences(xr_dataset: xarray.Dataset) -> None: | |
time_diffs = np.diff(xr_dataset.time.values.astype('datetime64[m]')) | |
unique_time_diffs, counts = np.unique(time_diffs, return_counts=True) | |
if len(unique_time_diffs) == 1: | |
print(f"There is only one unique time difference in the dataset: {unique_time_diffs[0]}") | |
return | |
print(f"There are {len(unique_time_diffs)} unique time differences in the dataset; the most common is {unique_time_diffs[0]} with {100*counts[0]/len(time_diffs):.5g}% frequency") | |
print_time_differences(ds) | |
# %% | |
start_time = "2020-01-01" | |
end_time = "2020-02-01" | |
data_inner_steps = 3 # how many steps to take in the time dimension (multiple of 5 minutes) | |
ds_filtered_time = ds.sel(time=slice(start_time, end_time, data_inner_steps)) | |
print_time_differences(ds_filtered_time) | |
# %% | |
lon_min = -16 | |
lon_max = 10 | |
lat_min = 45 | |
lat_max = 70 | |
# Convert lon-lat bounds to geostationary-coords | |
((x_min, x_max), (y_min, y_max)) = lon_lat_to_geostationary_area_coords( | |
[lon_min, lon_max], | |
[lat_min, lat_max], | |
ds_filtered_time.data, | |
) | |
# Define the spatial area to slice from | |
ds_filtered_space_time = ds_filtered_time.sel( | |
x_geostationary=slice(x_max, x_min), # x-axis is in decreasing order | |
y_geostationary=slice(y_min, y_max), | |
) | |
# %% | |
import memray | |
with memray.Tracker("save-data.bin"): | |
ds_filtered_space_time.to_zarr(f"{start_time}_{end_time}_EUMETSAT_SEVIRI_RSS_v4.zarr") | |
# %% | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@phinate
Instead of
ds_filtered_time = ds.sel(time=slice(start_time, end_time, data_inner_steps))
, I think the following might be better?It means we'd always have data on the the 00, 15, 30 and 45 minute mark rather than letting that vary. That might be cleaner?