Concatenate data files without correct time dimensions
#!/usr/bin/env python
# coding: utf-8

import xarray as xr
from pathlib import Path
from dateutil import parser
import datetime
import sys
def open_ETH(file, origin='1960-01-01', timechunk=50):
    """Open a netCDF file from ETH, alter the time metadata based on the filename,
    and return an xarray.Dataset

    file      : pathlib.Path object pointing to an ETH data file
    origin    : origin date for the time coordinate
    timechunk : chunk size along the time dimension used when the data is written out
    """
    # Open the data file, don't bother decoding, and immediately drop the time dimension
    ds = xr.open_dataset(file, engine='netcdf4', decode_cf=False).squeeze(dim='time', drop=True)
    # Parse the model time from the file name
    modeltime = datetime.datetime.strptime(file.name, "C%Y%m%d_%H")
    # Calculate the offset from the origin to the model time in days
    days = (modeltime - parser.parse(origin)) / datetime.timedelta(days=1)
    # Create a new time coordinate
    time = xr.DataArray([days], dims=['time'], coords={'time': [days]})
    # Add the time coordinate back into the data and set the time units
    ds = ds.expand_dims(time=time)
    ds.time.attrs['units'] = f"days since {origin}"
    # Set the chunking along the time dimension (first index) to timechunk
    # for all data variables
    for v in ds.data_vars:
        ds[v].encoding['chunksizes'] = (timechunk, *ds[v].encoding['chunksizes'][1:])
    # Return the modified dataset
    return ds
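
# A minimal usage sketch (the filename below is hypothetical, but follows the
# "C%Y%m%d_%H" pattern assumed by strptime above):
#   open_ETH(Path('C19600111_00'))
# would drop the file's original time dimension and attach a new one-element time
# coordinate of 10.0 with units "days since 1960-01-01".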
def concat_ETH(dir):
    """
    Open all ETH data files in a specified directory and concatenate them along the time axis
    """
    # Use a list comprehension to open all the files into a list, and use that as input
    # to the xarray.concat function
    ds = xr.concat([open_ETH(f) for f in sorted(Path(dir).glob('*'))], dim='time')
    # Save the source directory as an attribute to be used later. Converting through a
    # Path object drops any trailing slash, regardless of how the directory was specified.
    ds.attrs['source_directory'] = str(Path(dir))
    return ds
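
# Usage sketch (the directory name is hypothetical):
#   ds = concat_ETH('ETH_run1')
# opens every file matching ETH_run1/* in sorted order and stacks them along the
# new time dimension, one time step per input file.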
def save_ETH(ds, outpath=None):
    """
    Save the concatenated dataset to a netCDF file. This is just a wrapper around the
    to_netcdf method, but it manipulates the encoding information so that the data is
    saved with the proper chunking, and it infers an output path if one is not provided
    """
    # Create an encoding dictionary by grabbing the .encoding attributes from all the variables
    enc = ds.encoding
    for v in ds.variables:
        enc[v] = ds[v].encoding
    # Make sure the new time coordinate is not chunked and remove any unlimited dimensions
    enc['time']['contiguous'] = True
    try:
        del enc['unlimited_dims']
    except KeyError:
        pass
    if outpath is None:
        # Set the output path to the name of the concatenated directory with a `.nc` suffix
        outpath = ds.attrs['source_directory'] + '.nc'
    ds.to_netcdf(outpath, encoding=enc)
    return outpath
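
# Usage sketch (again with a hypothetical directory name):
#   save_ETH(concat_ETH('ETH_run1'))
# writes ETH_run1.nc with an unchunked time coordinate and each data variable
# chunked timechunk (default 50) steps at a time along the time dimension.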
def main(directories):
    """
    Main routine that loops over the directories passed as command line arguments
    """
    for d in directories:
        ds = concat_ETH(d)
        saved_file = save_ETH(ds)
        print(f"Concatenated {d} to {saved_file}")


if __name__ == '__main__':
    # Pass all command line arguments after the command name to main
    sys.exit(main(sys.argv[1:]))
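
# Example command line invocation (the script and directory names are hypothetical):
#   python concat_eth.py ETH_run1 ETH_run2
# which concatenates each directory of files into ETH_run1.nc and ETH_run2.nc.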