---
title: "Loading ERA5 data in Python and processing with R"
format: html
editor: visual
editor_options: 
  chunk_output_type: console
---
```{r setup}
library(Rarr)        # read Zarr arrays directly from cloud storage
library(tidyverse)
library(lubridate)   # date/time helpers
library(raster)      # raster templates and GeoTIFF handling
library(ncdf4)       # NetCDF read/write
library(tmap)        # thematic mapping
library(sf)          # vector data
library(gstat)       # interpolation (IDW)
library(qgisprocess) # run QGIS/SAGA algorithms from R
```
This document loads ERA5 data from the cloud-hosted Zarr store and processes it in R. More information on the data is available on this page:

https://cloud.google.com/storage/docs/public-datasets/era5

And this one:

https://github.com/google-research/arco-era5/tree/main
# Downloading data in R

The code below gets an overview of the cloud-based single-level reanalysis data for ERA5.
```{r}
s3_all <- "https://storage.googleapis.com/gcp-public-data-arco-era5/co/single-level-reanalysis.zarr-v2"
zarr_overview(s3_all)
```
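The same helper can also point at a single sub-array before any values are downloaded. For instance, a quick look at `t2m` (the 2 m temperature variable used later in this document) shows its shape, chunking, and data type:

```{r}
# Inspect one variable's shape, chunking, and dtype before reading it
zarr_overview(paste0(s3_all, "/t2m"))
```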
The origin date for the data is Jan 1, 1900. Let's get a list of all hours/dates in the dataset (the `time` variable) and then calculate the date for each hourly time slice.
```{r}
#zarr_overview(paste0(s3_all, "/time"))
s3_time <- read_zarr_array(paste0(s3_all, "/time"))

# Time values are hours since the origin; convert to calendar dates
origin_date <- as.POSIXct("1900-01-01 00:00:00", tz = "UTC")

s3_time_df <- data.frame(time = s3_time) %>% 
  mutate(date = origin_date + (time * (60 * 60)),
         hour = hour(date),
         month = month(date),
         year = year(date),
         date = date(date),
         month_year = paste0(month, "/", year)) %>% 
  filter(date < today() - 90 & date > mdy("12/31/1939"))

# 350617-1092816 is the range of valid values for time, which roughly
# corresponds to the Jan 1940-Aug 2024 window. There's a lag in data
# availability, which is why the cutoff is 90 days before today.

date_list <- unique(s3_time_df$date)
#tail(date_list)
```
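As a sanity check on the hours-since-1900 conversion, the lower bound of the valid range noted above (350617) should land right at the start of the Jan 1940 window:

```{r}
# 350617 hours after the 1900-01-01 origin is 14609 days and 1 hour,
# which falls on Jan 1, 1940
as.POSIXct("1900-01-01 00:00:00", tz = "UTC") + 350617 * 3600
#> [1] "1940-01-01 01:00:00 UTC"
```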
We can also download the lat/lon coordinates shared by every time slice of the data grid and set up a blank raster within the boundaries of those points.
```{r} | |
lat_s3<-"https://storage.googleapis.com/gcp-public-data-arco-era5/co/single-level-reanalysis.zarr-v2/latitude" | |
lat<-data.frame(lat=read_zarr_array(lat_s3)) | |
lon_s3<-"https://storage.googleapis.com/gcp-public-data-arco-era5/co/single-level-reanalysis.zarr-v2/longitude" | |
lon<-data.frame(lon_raw=read_zarr_array(lon_s3)) %>% | |
mutate(lon=if_else(lon_raw>180,lon_raw-360,lon_raw)) | |
points<-bind_cols(lat,lon) %>% | |
st_as_sf(coords=c("lon","lat"),crs=4326) | |
st_write(points,"data/latlon.gpkg") | |
extent<-extent(-180,180,-90,90) | |
r <- raster(extent, res = 0.5,crs=4326) | |
grid<-st_as_sf(st_make_grid(r,cellsize=0.4,square = FALSE)) %>% | |
mutate(grid_id=paste0("G",str_pad(row_number(),6,pad="0"))) | |
# Attempt to write to NetCDF | |
# # Get raster properties | |
# lon_r <- xFromCol(r_sel, 1:nrow(r)) # Longitude values | |
# lat_r <- yFromRow(r_sel, 1:ncol(r)) # Latitude values | |
# | |
# # Create dimensions | |
# lon_dim <- ncdim_def("lon", "degrees_east", vals = lon_r, longname = "Longitude") | |
# lat_dim <- ncdim_def("lat", "degrees_north", vals = lat_r, longname = "Latitude") | |
# | |
# # Define CRS metadata | |
# crs <- ncvar_def("crs", "", dim = list(), longname = "Coordinate Reference System", | |
# grid_mapping_name = "latitude_longitude", | |
# epsg_code = "EPSG:4326") | |
``` | |
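For reference, here's a minimal self-contained version of the NetCDF write the commented code above was sketching. The placeholder values and the output path `data/era5_t2m_demo.nc` are illustrative, not part of the original workflow:

```{r}
# A minimal ncdf4 write on a 0.5-degree global grid, using placeholder data
lon_vals <- seq(-179.75, 179.75, by = 0.5)
lat_vals <- seq(-89.75, 89.75, by = 0.5)

lon_dim <- ncdim_def("lon", "degrees_east", vals = lon_vals, longname = "Longitude")
lat_dim <- ncdim_def("lat", "degrees_north", vals = lat_vals, longname = "Latitude")
t2m_var <- ncvar_def("t2m_mean", "K", dim = list(lon_dim, lat_dim),
                     longname = "Mean 2 m temperature")

nc <- nc_create("data/era5_t2m_demo.nc", list(t2m_var))
ncvar_put(nc, t2m_var, matrix(273.15, nrow = length(lon_vals), ncol = length(lat_vals)))
nc_close(nc)
```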
Now we can download data for a specific variable. The code below iterates over the 24 hourly slices in each day of a selected range, calculates a summary statistic (mean temperature at 2 meters in this case), interpolates those point values to a raster with a thin plate spline, and writes out a GeoTIFF for each day.
```{r}
model_data_s3 <- "https://storage.googleapis.com/gcp-public-data-arco-era5/co/single-level-reanalysis.zarr-v2/t2m"

# date_sel <- date_list[20000] # example date for testing the function below

# Stashed attempt at defining the NetCDF output variable/file:
# nc_file <- "data/era5_data/rasters/era5_data_t2m.nc"
# 
# t2m_mean_var <- ncvar_def("T2m_mean", "degrees Kelvin", dim = list(lon_dim, lat_dim),
#                           longname = "Mean surface temperature")
# 
# # Create NetCDF file
# nc <- nc_create(nc_file, list(lon_dim, lat_dim, crs, t2m_mean_var))

date_avg <- function(date_sel){
  # Identify the hourly time slices belonging to the selected date
  date_list_sel <- s3_time_df %>% 
    filter(date == date_sel)
  start_row <- min(date_list_sel$time)
  end_row <- max(date_list_sel$time)
  
  # Read all 24 hourly slices for the day and average them at each point
  var_data <- read_zarr_array(model_data_s3, index = list(start_row:end_row, NULL))
  var_mean <- apply(var_data, 2, mean)
  
  var_latlon <- data.frame(date = date_sel,
                           var = var_mean,
                           lon_raw = lon$lon,
                           lat = lat$lat) %>% 
    mutate(lon = if_else(lon_raw > 180, lon_raw - 360, lon_raw)) %>% 
    st_as_sf(coords = c("lon", "lat"), crs = 4326)
  
  # qgis_show_help("saga:thinplatespline") # run once to see parameter options
  
  # Interpolate the points to a raster surface with SAGA's thin plate spline
  file_out <- paste0("rasters/", date_sel, ".tif")
  output <- qgis_run_algorithm(
    "saga:thinplatespline",
    SHAPES = var_latlon,    # Input point layer
    FIELD = "var",          # Field containing values to interpolate
    # TARGET_USER_XMIN = -180, TARGET_USER_XMAX = 180, # extent options,
    # TARGET_USER_YMIN = -90, TARGET_USER_YMAX = 90,   # left at defaults here
    TARGET_USER_SIZE = 0.5, # Cell size for the raster
    TARGET_OUT_GRID = file_out
  )
  
  # Earlier approaches, kept for reference:
  # point_sum <- var_latlon %>% 
  #   st_join(grid, join = st_within) %>% 
  #   group_by(grid_id, date) %>% 
  #   st_set_geometry(NULL) %>% 
  #   summarise(var_mean = mean(var))
  # grid_var <- grid %>% 
  #   inner_join(point_sum)
  # 
  # tm_shape(grid_var) + tm_borders()
  # st_write(grid_var, "data/grid_temp.gpkg")
  # 
  # r_sel <- idw(var ~ 1, var_latlon, stars::st_as_stars(r))
  # r_sel <- rasterize(var_latlon, r, field = "var", fun = mean, background = NA)
  # r_sel1 <- terra::focal(r_sel, w = matrix(1/9, nc = 5, nr = 5), fun = mean, na.policy = "only")
  # writeRaster(r_sel, paste0("data/era5_data/rasters/", date_sel, ".tif"))
}

dates_sel <- s3_time_df %>% 
  #filter(date %in% c("1940","1950","1960")) %>% 
  filter(month_year == "7/1975" & date > "1975-07-07") %>% 
  distinct(date)

walk(dates_sel$date, date_avg)
```
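As a quick check that the loop produced usable output, one of the written GeoTIFFs can be read back and plotted. The filename below is illustrative; any date written by the loop above will work:

```{r}
# Read back one interpolated surface and eyeball it
r_check <- raster("rasters/1975-07-08.tif")
plot(r_check)
```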
# Creating a GIF map

Finally, we can map each daily raster and stitch the images into an animated GIF.
```{r}
library(rnaturalearth)
library(tmaptools)
library(gifski)

# List the GeoTIFFs written for July 1975
rasters <- list.files(path = "rasters", pattern = "1975", full.names = TRUE)
rasters <- rasters[grepl("tif", rasters)]

world <- ne_countries(scale = "medium", returnclass = "sf")

raster_map <- function(file_sel){
  #file_sel <- "rasters/1975-07-25.tif"
  data1 <- raster(file_sel)
  data_date <- substr(file_sel, 9, 18) # pull the ISO date out of the file path
  
  map <- tm_shape(data1) +
    tm_raster(style = "quantile", n = 7,
              palette = get_brewer_pal("OrRd", n = 7, plot = FALSE)) +
    tm_layout(legend.show = FALSE,
              main.title = data_date,
              main.title.size = 1) +
    tm_shape(world) +
    tm_borders(col = "black")
  
  tmap_save(map, paste0("rasters/image/map_", data_date, ".png"),
            width = 10, height = 5)
}

walk(rasters, raster_map)

# Stitch the PNG frames into an animated GIF
raster_maps <- list.files(path = "rasters/image", pattern = "png", full.names = TRUE)
gifski(raster_maps, gif_file = "jul75_temp.gif", delay = 0.5, width = 1000, height = 500)
```
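`gifski()` uses the frames in the order they are given, so it's worth confirming the PNG list sorts chronologically. It does here, since `list.files()` sorts alphabetically and ISO-dated filenames sort the same way alphabetically as chronologically:

```{r}
# ISO dates (YYYY-MM-DD) sort identically alphabetically and chronologically
head(basename(raster_maps))
```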