@aidanheerdegen
Created January 17, 2022 03:51
Concatenate data files without correct time dimensions
#!/usr/bin/env python
# coding: utf-8
import xarray as xr
from pathlib import Path
from dateutil import parser
import datetime
import sys


def open_ETH(file, origin='1960-01-01', timechunk=50):
    """Open a netCDF file from ETH, alter the time metadata based on the filename
    and return an xarray.Dataset

    file      : pathlib.Path object pointing to an ETH data file
    origin    : origin date for the time coordinate
    timechunk : chunk size along the time dimension for the data variables
    """
    # Open the data file, don't bother decoding, and immediately drop the time dimension
    ds = xr.open_dataset(file, engine='netcdf4', decode_cf=False).squeeze(dim='time', drop=True)
    # Parse the model time from the file name
    modeltime = datetime.datetime.strptime(file.name, "C%Y%m%d_%H")
    # Calculate the offset from the origin to the model time in days
    days = (modeltime - parser.parse(origin)) / datetime.timedelta(days=1)
    # Create a new time coordinate
    time = xr.DataArray([days], dims=['time'], coords={'time': [days]})
    # Add the time coordinate back into the data and set the time units
    ds = ds.expand_dims(time=time)
    ds.time.attrs['units'] = f"days since {origin}"
    # Set the chunking along the time dimension (first index) to timechunk
    # for all data variables
    for v in ds.data_vars:
        ds[v].encoding['chunksizes'] = (timechunk, *ds[v].encoding['chunksizes'][1:])
    # Return dataset
    return ds
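
# Example (hypothetical filename): a file named "C19600111_00" parses to
# 1960-01-11 00:00, which is 10 days after the default origin of 1960-01-01, so
# the returned dataset would carry a single time value of 10.0 with units
# "days since 1960-01-01":
#
#     ds = open_ETH(Path("C19600111_00"))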


def concat_ETH(dir):
    """
    Open all ETH data files in a specified directory and concatenate along the time axis
    """
    # Use a list comprehension to open all the files into a list, and use that as input
    # to the xarray.concat function
    ds = xr.concat([open_ETH(f) for f in sorted(Path(dir).glob('*'))], dim='time')
    # Save the source directory as an attribute to be used later. Note that converting
    # via a Path object drops any trailing slash, regardless of how the directory
    # name was specified.
    ds.attrs['source_directory'] = str(Path(dir))
    return ds
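
# Hypothetical usage: concatenate every file found in a directory named "run01".
# The files are sorted by name, so the result is in chronological order when the
# filenames encode the date as above:
#
#     ds = concat_ETH("run01")
#     ds.time.values   # offsets in days since the origin, one per file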


def save_ETH(ds, outpath=None):
    """
    Save the concatenated dataset to a netCDF file. This is just a wrapper around the
    to_netcdf method, but it does some manipulation of the encoding information so that
    the data is saved with the proper chunking, and it infers an output path if one is
    not provided
    """
    # Create an encoding dictionary by grabbing the .encoding attributes from all the variables
    enc = ds.encoding
    for v in ds.variables:
        enc[v] = ds[v].encoding
    # Make sure the new time coordinate is not chunked and remove any unlimited dimensions
    enc['time']['contiguous'] = True
    try:
        del enc['unlimited_dims']
    except KeyError:
        pass
    if outpath is None:
        # Set the output path to the name of the concatenated directory with a `.nc` suffix
        outpath = ds.attrs['source_directory'] + '.nc'
    ds.to_netcdf(outpath, encoding=enc)
    return outpath
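
# Hypothetical usage: write the concatenated dataset out. With no outpath the
# file is named after the source directory recorded by concat_ETH:
#
#     save_ETH(ds)                  # writes "run01.nc" for the example above
#     save_ETH(ds, "combined.nc")   # or give an explicit output path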


def main(directories):
    """
    Main routine that loops over the directories passed as command line arguments
    """
    for d in directories:
        ds = concat_ETH(d)
        saved_file = save_ETH(ds)
        print(f"Concatenated {d} to {saved_file}")


if __name__ == '__main__':
    # Pass all command line arguments after the command name to main
    sys.exit(main(sys.argv[1:]))
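
# Command line usage (the script name here is a placeholder, not part of the gist):
#
#     python concat_eth.py /path/to/run01 /path/to/run02
#
# Each directory is concatenated and written to <directory>.nc alongside it.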