Skip to content

Instantly share code, notes, and snippets.

@andersy005
Last active July 18, 2020 05:06
Show Gist options
  • Select an option

  • Save andersy005/5fb0954efd9e5fd6ea29d95c73ec6e76 to your computer and use it in GitHub Desktop.

Select an option

Save andersy005/5fb0954efd9e5fd6ea29d95c73ec6e76 to your computer and use it in GitHub Desktop.
import os.path
import warnings
from glob import glob
from io import BytesIO
from numbers import Number
from pathlib import Path
from xarray.backends.common import AbstractDataStore, ArrayWriter
from xarray.core.utils import close_on_error, is_grib_path, is_remote_uri
from xarray.core import indexing
from xarray import backends, conventions, coding
def _get_default_engine_remote_uri():
    """Pick the engine used for remote (OPeNDAP) datasets.

    Returns
    -------
    str
        ``"netcdf4"`` when ``netCDF4`` can be imported, otherwise
        ``"pydap"`` when ``pydap`` can be imported.

    Raises
    ------
    ValueError
        If neither ``netCDF4`` nor ``pydap`` is importable.
    """
    try:
        import netCDF4  # noqa: F401
    except ImportError:  # pragma: no cover
        # netCDF4 is unavailable; fall back to pydap.
        try:
            import pydap  # noqa: F401
        except ImportError:
            raise ValueError(
                "netCDF4 or pydap is required for accessing "
                "remote datasets via OPeNDAP"
            )
        return "pydap"
    return "netcdf4"
def _get_default_engine_grib():
    """Report how a GRIB file can be opened; never returns an engine.

    This function always raises: GRIB files have no implicit default
    engine, so the caller is told which ``engine=`` values are usable
    with the libraries that are currently importable.

    Raises
    ------
    ValueError
        Always. The message lists ``engine='pynio'`` and/or
        ``engine='cfgrib'`` when the matching library can be imported,
        or states that one of them is required when neither can.
    """
    hints = []

    try:
        import Nio  # noqa: F401
    except ImportError:  # pragma: no cover
        pass
    else:
        hints.append("set engine='pynio' to access GRIB files with PyNIO")

    try:
        import cfgrib  # noqa: F401
    except ImportError:  # pragma: no cover
        pass
    else:
        hints.append("set engine='cfgrib' to access GRIB files with cfgrib")

    if not hints:
        raise ValueError("PyNIO or cfgrib is required for accessing GRIB files")
    raise ValueError(" or\n".join(hints))
def _get_default_engine_gz():
    """Pick the engine used for ``.gz`` files.

    Returns
    -------
    str
        ``"scipy"`` when ``scipy`` can be imported.

    Raises
    ------
    ValueError
        If ``scipy`` is not importable.
    """
    try:
        import scipy  # noqa: F401
    except ImportError:  # pragma: no cover
        raise ValueError("scipy is required for accessing .gz files")
    return "scipy"
def _get_default_engine_netcdf():
    """Pick the engine used for ordinary netCDF paths.

    Returns
    -------
    str
        ``"netcdf4"`` when ``netCDF4`` can be imported, otherwise
        ``"scipy"`` when ``scipy.io.netcdf`` can be imported.

    Raises
    ------
    ValueError
        If neither library is importable.
    """
    try:
        import netCDF4  # noqa: F401
    except ImportError:  # pragma: no cover
        # netCDF4 is unavailable; fall back to scipy's netCDF reader.
        try:
            import scipy.io.netcdf  # noqa: F401
        except ImportError:
            raise ValueError(
                "cannot read or write netCDF files without "
                "netCDF4-python or scipy installed"
            )
        return "scipy"
    return "netcdf4"
def _get_engine_from_magic_number(filename_or_obj):
    """Choose an engine by inspecting the first eight bytes of the input.

    Parameters
    ----------
    filename_or_obj : bytes or file-like
        Either the raw file contents, or an object providing ``tell``,
        ``read`` and ``seek``. A file-like object must be positioned at
        offset zero; it is rewound to zero after the header is read.

    Returns
    -------
    str
        ``"scipy"`` when the header starts with ``b"CDF"``, or
        ``"h5netcdf"`` when a file-like object starts with the HDF5
        signature.

    Raises
    ------
    ValueError
        If a file-like object is not positioned at zero, if HDF5 content
        is passed as ``bytes``, or if the header matches neither format.
    """
    given_bytes = isinstance(filename_or_obj, bytes)

    # Read the header that identifies the file type.
    if given_bytes:
        header = filename_or_obj[:8]
    else:
        if filename_or_obj.tell() != 0:
            raise ValueError(
                "file-like object read/write pointer not at zero "
                "please close and reopen, or use a context "
                "manager"
            )
        header = filename_or_obj.read(8)
        filename_or_obj.seek(0)

    if header.startswith(b"CDF"):
        return "scipy"

    if header.startswith(b"\211HDF\r\n\032\n"):
        if given_bytes:
            raise ValueError(
                "can't open netCDF4/HDF5 as bytes "
                "try passing a path or file-like object"
            )
        return "h5netcdf"

    # Unknown header: keep the error message short for long byte strings.
    shown = filename_or_obj
    if given_bytes and len(shown) > 80:
        shown = shown[:80] + b"..."
    raise ValueError(
        "{} is not a valid netCDF file "
        "did you mean to pass a string for a path instead?".format(shown)
    )
def _get_default_engine(path, allow_remote=False):
    """Select the default engine for a path string.

    The checks run in this order: remote URI (only when ``allow_remote``
    is true), GRIB path, ``.gz`` suffix, and finally plain netCDF.

    Parameters
    ----------
    path : str
        Path or URI of the dataset.
    allow_remote : bool, optional
        Whether ``path`` may be treated as a remote URI.

    Returns
    -------
    str
        The engine name produced by the matching helper.
    """
    if allow_remote and is_remote_uri(path):
        return _get_default_engine_remote_uri()
    if is_grib_path(path):
        return _get_default_engine_grib()
    if path.endswith(".gz"):
        return _get_default_engine_gz()
    return _get_default_engine_netcdf()
def _normalize_path(path):
    """Return remote URIs unchanged and local paths as absolute paths.

    For a local path, ``~`` is expanded before the path is made absolute.
    """
    if is_remote_uri(path):
        return path
    expanded = os.path.expanduser(path)
    return os.path.abspath(expanded)
def _validate_dataset_names(dataset):
    """Check that every key of ``dataset.variables`` is a string or None.

    Raises
    ------
    ValueError
        If a key is an empty string.
    TypeError
        If a key is neither a string nor ``None``.
    """

    def check_name(name):
        # ``None`` is always acceptable.
        if name is None:
            return
        if not isinstance(name, str):
            raise TypeError(
                "DataArray.name or Dataset key must be either a "
                "string or None for serialization to netCDF files"
            )
        if not name:
            raise ValueError(
                "Invalid name for DataArray or Dataset key: "
                "string must be length 1 or greater for "
                "serialization to netCDF files"
            )

    for key in dataset.variables:
        check_name(key)
def _validate_attrs(dataset):
    """Check that all attributes can be serialized to netCDF.

    Every key in ``dataset.attrs`` and in the ``attrs`` of each variable
    must be a non-empty string, and every value must be a number, a
    string, an ndarray or a list/tuple.

    Raises
    ------
    ValueError
        If an attribute name is an empty string.
    TypeError
        If an attribute name is not a string, or a value has an
        unsupported type.
    """
    # ``np`` is not imported at module level in this file, so the type
    # check below would fail with NameError; import it locally, as the
    # other helpers here do for their optional dependencies.
    import numpy as np

    valid_types = (str, Number, np.ndarray, np.number, list, tuple)

    def check_attr(name, value):
        if isinstance(name, str):
            if not name:
                raise ValueError(
                    "Invalid name for attr: string must be "
                    "length 1 or greater for serialization to "
                    "netCDF files"
                )
        else:
            raise TypeError(
                "Invalid name for attr: {} must be a string for "
                "serialization to netCDF files".format(name)
            )

        if not isinstance(value, valid_types):
            raise TypeError(
                "Invalid value for attr: {} must be a number, "
                "a string, an ndarray or a list/tuple of "
                "numbers/strings for serialization to netCDF "
                "files".format(value)
            )

    # Check attrs on the dataset itself
    for k, v in dataset.attrs.items():
        check_attr(k, v)

    # Check attrs on each variable within the dataset
    for variable in dataset.variables.values():
        for k, v in variable.attrs.items():
            check_attr(k, v)
def _protect_dataset_variables_inplace(dataset, cache):
    """Wrap variable data in copy-on-write (and optionally cached) arrays.

    Each variable whose name is not one of its own dimensions has its
    ``_data`` wrapped in ``indexing.CopyOnWriteArray`` and, when ``cache``
    is truthy, additionally in ``indexing.MemoryCachedArray``. The result
    is assigned back to ``variable.data``, so ``dataset`` is modified in
    place.
    """
    for var_name, var in dataset.variables.items():
        if var_name in var.dims:
            # no need to protect IndexVariable objects
            continue
        wrapped = indexing.CopyOnWriteArray(var._data)
        if cache:
            wrapped = indexing.MemoryCachedArray(wrapped)
        var.data = wrapped
def _finalize_store(write, store):
    """Release the ``write`` handle, then close ``store``.

    Parameters
    ----------
    write : object
        Handle for the pending write; only the local reference to it is
        dropped here.
    store : object
        Store to close; must provide ``close()``.
    """
    # Drop the write reference before closing so that writing is done first.
    del write
    store.close()
def maybe_decode_store(
    store,
    mask_and_scale,
    decode_times,
    concat_characters,
    decode_coords,
    drop_variables,
    use_cftime,
    decode_timedelta,
    cache,
    chunks,
    filename_or_obj,
    group,
    decode_cf,
    engine,
    lock=False,
):
    """Decode ``store`` into a dataset and optionally chunk it with dask.

    The store is decoded with ``conventions.decode_cf`` and its variables
    are protected in place. When ``chunks`` is not None the dataset is
    chunked under a name derived from a dask token of the opening
    arguments; for a local file path the token also includes the file's
    modification time.

    Parameters
    ----------
    store : data store
        Store handed to ``conventions.decode_cf``.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
drop_variables, use_cftime, decode_timedelta
        Forwarded to ``conventions.decode_cf``; also part of the token.
    cache : bool
        Forwarded to ``_protect_dataset_variables_inplace``.
    chunks : int, dict or None
        Chunking passed to ``Dataset.chunk``; ``None`` skips chunking.
    filename_or_obj, group, decode_cf, engine
        Only used to build the dask token.
    lock : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Dataset
        The decoded dataset, chunked when ``chunks`` was given.
    """
    decoded = conventions.decode_cf(
        store,
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )
    _protect_dataset_variables_inplace(decoded, cache)

    if chunks is None:
        return decoded

    from dask.base import tokenize

    # For an actual local file path, fold the modification time into the
    # token so that a changed file yields a different token.
    is_local_path = isinstance(filename_or_obj, str) and not is_remote_uri(
        filename_or_obj
    )
    mtime = os.path.getmtime(filename_or_obj) if is_local_path else None

    token = tokenize(
        filename_or_obj,
        mtime,
        group,
        decode_cf,
        mask_and_scale,
        decode_times,
        concat_characters,
        decode_coords,
        engine,
        chunks,
        drop_variables,
        use_cftime,
        decode_timedelta,
    )
    chunked = decoded.chunk(
        chunks, name_prefix="open_dataset-%s" % token, token=token
    )
    # Carry the file handle over to the chunked dataset.
    chunked._file_obj = decoded._file_obj
    return chunked
def open_dataset(
    filename_or_obj,
    group=None,
    decode_cf=True,
    mask_and_scale=None,
    decode_times=True,
    autoclose=None,
    concat_characters=True,
    decode_coords=True,
    engine=None,
    chunks=None,
    lock=None,
    cache=None,
    drop_variables=None,
    backend_kwargs=None,
    use_cftime=None,
    decode_timedelta=None,
):
    """Open and decode a dataset from a file or file-like object.

    Parameters
    ----------
    filename_or_obj : str, Path, file or xarray.backends.*DataStore
        Strings and Path objects are interpreted as a path to a netCDF file
        or an OpenDAP URL and opened with python-netCDF4, unless the filename
        ends with .gz, in which case the file is gunzipped and opened with
        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
    group : str, optional
        Path to the netCDF4 group in the given file to open (only works for
        netCDF4 files).
    decode_cf : bool, optional
        Whether to decode these variables, assuming they were saved according
        to CF conventions. When False, ``mask_and_scale``, ``decode_times``,
        ``concat_characters``, ``decode_coords`` and ``decode_timedelta`` are
        all forced to False.
    mask_and_scale : bool, optional
        If True, replace array values equal to `_FillValue` with NA and scale
        values according to the formula `original_values * scale_factor +
        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
        taken from variable attributes (if they exist). If the `_FillValue` or
        `missing_value` attribute contains multiple values a warning will be
        issued and all array values matching one of the multiple values will
        be replaced by NA. mask_and_scale defaults to True except for the
        pseudonetcdf backend.
    decode_times : bool, optional
        If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.
    autoclose : bool, optional
        No longer used. Passing any value other than None only triggers a
        FutureWarning; the value itself is ignored.
    concat_characters : bool, optional
        If True, concatenate along the last dimension of character arrays to
        form string arrays. Dimensions will only be concatenated over (and
        removed) if they have no corresponding variable and if they are only
        used as the last dimension of character arrays.
    decode_coords : bool, optional
        If True, decode the 'coordinates' attribute to identify coordinates in
        the resulting dataset.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', 'pseudonetcdf'}, optional
        Engine to use when reading files. If not provided, the default engine
        is chosen based on available dependencies, with a preference for
        'netcdf4'. Bytes and file-like objects can only be read with 'scipy'
        or 'h5netcdf'.
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
        arrays. ``chunks={}`` loads the dataset with dask using a single
        chunk for all arrays.
    lock : False or duck threading.Lock, optional
        Resource lock to use when reading data from disk. Only relevant when
        using dask or another form of parallelism. By default, appropriate
        locks are chosen to safely read and write files with the currently
        active dask scheduler.
    cache : bool, optional
        If True, cache data loaded from the underlying datastore in memory as
        NumPy arrays when accessed to avoid reading from the underlying data-
        store multiple times. Defaults to True unless you specify the `chunks`
        argument to use dask, in which case it defaults to False. Does not
        change the behavior of coordinates corresponding to dimensions, which
        always load their data from disk into a ``pandas.Index``.
    drop_variables : string or iterable, optional
        A variable or list of variables to exclude from being parsed from the
        dataset. This may be useful to drop variables with problems or
        inconsistent values.
    backend_kwargs : dictionary, optional
        A dictionary of keyword arguments to pass on to the backend. This
        may be useful when backend options would improve performance or
        allow user control of dataset processing.
    use_cftime : bool, optional
        Only relevant if encoded dates come from a standard calendar
        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
        specified). If None (default), attempt to decode times to
        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
        ``cftime.datetime`` objects. If True, always decode times to
        ``cftime.datetime`` objects, regardless of whether or not they can be
        represented using ``np.datetime64[ns]`` objects. If False, always
        decode times to ``np.datetime64[ns]`` objects; if this is not possible
        raise an error.
    decode_timedelta : bool, optional
        If True, decode variables and coordinates with time units in
        {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}
        into timedelta objects. If False, leave them encoded as numbers.
        If None (default), assume the same value of decode_times.

    Returns
    -------
    dataset : Dataset
        The newly created dataset.

    Raises
    ------
    ValueError
        If ``engine`` is not one of the recognized engines, or if bytes or a
        file-like object is passed with an engine other than 'scipy' or
        'h5netcdf'.

    Notes
    -----
    ``open_dataset`` opens the file with read-only access. When you modify
    values of a Dataset, even one linked to files on disk, only the in-memory
    copy you are manipulating in xarray is modified: the original file on disk
    is never touched.

    See Also
    --------
    open_mfdataset
    """
    engines = [
        None,
        "netcdf4",
        "scipy",
        "pydap",
        "h5netcdf",
        "pynio",
        "cfgrib",
        "pseudonetcdf",
    ]
    if engine not in engines:
        raise ValueError(
            "unrecognized engine for open_dataset: {}\n"
            "must be one of: {}".format(engine, engines)
        )

    if autoclose is not None:
        warnings.warn(
            "The autoclose argument is no longer used by "
            "xarray.open_dataset() and is now ignored; it will be removed in "
            "a future version of xarray. If necessary, you can control the "
            "maximum number of simultaneous open files with "
            "xarray.set_options(file_cache_maxsize=...).",
            FutureWarning,
            stacklevel=2,
        )

    # Resolve defaults that depend on other arguments.
    if mask_and_scale is None:
        mask_and_scale = engine != "pseudonetcdf"

    if not decode_cf:
        # Without CF decoding every individual decoding step is disabled.
        mask_and_scale = decode_times = concat_characters = False
        decode_coords = decode_timedelta = False

    if cache is None:
        cache = chunks is None

    if backend_kwargs is None:
        backend_kwargs = {}

    if isinstance(filename_or_obj, Path):
        filename_or_obj = str(filename_or_obj)

    if isinstance(filename_or_obj, AbstractDataStore):
        # Already an opened store: use it as is.
        store = filename_or_obj

    elif isinstance(filename_or_obj, str):
        filename_or_obj = _normalize_path(filename_or_obj)

        if engine is None:
            engine = _get_default_engine(filename_or_obj, allow_remote=True)

        # One opener per engine; only the selected one is called.
        path_openers = {
            "netcdf4": lambda: backends.NetCDF4DataStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            ),
            "scipy": lambda: backends.ScipyDataStore(
                filename_or_obj, **backend_kwargs
            ),
            "pydap": lambda: backends.PydapDataStore.open(
                filename_or_obj, **backend_kwargs
            ),
            "h5netcdf": lambda: backends.H5NetCDFStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            ),
            "pynio": lambda: backends.NioDataStore(
                filename_or_obj, lock=lock, **backend_kwargs
            ),
            "pseudonetcdf": lambda: backends.PseudoNetCDFDataStore.open(
                filename_or_obj, lock=lock, **backend_kwargs
            ),
            "cfgrib": lambda: backends.CfGribDataStore(
                filename_or_obj, lock=lock, **backend_kwargs
            ),
        }
        store = path_openers[engine]()

    else:
        # Bytes or a file-like object: the engine is taken from the header.
        if engine not in [None, "scipy", "h5netcdf"]:
            raise ValueError(
                "can only read bytes or file-like objects "
                "with engine='scipy' or 'h5netcdf'"
            )
        engine = _get_engine_from_magic_number(filename_or_obj)
        if engine == "scipy":
            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
        elif engine == "h5netcdf":
            store = backends.H5NetCDFStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            )

    with close_on_error(store):
        ds = maybe_decode_store(
            store,
            mask_and_scale,
            decode_times,
            concat_characters,
            decode_coords,
            drop_variables,
            use_cftime,
            decode_timedelta,
            cache,
            chunks,
            filename_or_obj,
            group,
            decode_cf,
            engine,
        )

    # Ensure source filename always stored in dataset object (GH issue #2550)
    if "source" not in ds.encoding and isinstance(filename_or_obj, str):
        ds.encoding["source"] = filename_or_obj

    return ds
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr\n",
"import netCDF4 as nc\n",
"from netCDF4 import Dataset\n",
"from api import open_dataset, maybe_decode_store\n",
"xr.open_dataset = open_dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"f = \"~/.xarray_tutorial_data/air_temperature.nc\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><svg style=\"position: absolute; width: 0; height: 0; overflow: hidden\">\n",
"<defs>\n",
"<symbol id=\"icon-database\" viewBox=\"0 0 32 32\">\n",
"<path d=\"M16 0c-8.837 0-16 2.239-16 5v4c0 2.761 7.163 5 16 5s16-2.239 16-5v-4c0-2.761-7.163-5-16-5z\"></path>\n",
"<path d=\"M16 17c-8.837 0-16-2.239-16-5v6c0 2.761 7.163 5 16 5s16-2.239 16-5v-6c0 2.761-7.163 5-16 5z\"></path>\n",
"<path d=\"M16 26c-8.837 0-16-2.239-16-5v6c0 2.761 7.163 5 16 5s16-2.239 16-5v-6c0 2.761-7.163 5-16 5z\"></path>\n",
"</symbol>\n",
"<symbol id=\"icon-file-text2\" viewBox=\"0 0 32 32\">\n",
"<path d=\"M28.681 7.159c-0.694-0.947-1.662-2.053-2.724-3.116s-2.169-2.030-3.116-2.724c-1.612-1.182-2.393-1.319-2.841-1.319h-15.5c-1.378 0-2.5 1.121-2.5 2.5v27c0 1.378 1.122 2.5 2.5 2.5h23c1.378 0 2.5-1.122 2.5-2.5v-19.5c0-0.448-0.137-1.23-1.319-2.841zM24.543 5.457c0.959 0.959 1.712 1.825 2.268 2.543h-4.811v-4.811c0.718 0.556 1.584 1.309 2.543 2.268zM28 29.5c0 0.271-0.229 0.5-0.5 0.5h-23c-0.271 0-0.5-0.229-0.5-0.5v-27c0-0.271 0.229-0.5 0.5-0.5 0 0 15.499-0 15.5 0v7c0 0.552 0.448 1 1 1h7v19.5z\"></path>\n",
"<path d=\"M23 26h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
"<path d=\"M23 22h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
"<path d=\"M23 18h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
"</symbol>\n",
"</defs>\n",
"</svg>\n",
"<style>/* CSS stylesheet for displaying xarray objects in jupyterlab.\n",
" *\n",
" */\n",
"\n",
":root {\n",
" --xr-font-color0: var(--jp-content-font-color0, rgba(0, 0, 0, 1));\n",
" --xr-font-color2: var(--jp-content-font-color2, rgba(0, 0, 0, 0.54));\n",
" --xr-font-color3: var(--jp-content-font-color3, rgba(0, 0, 0, 0.38));\n",
" --xr-border-color: var(--jp-border-color2, #e0e0e0);\n",
" --xr-disabled-color: var(--jp-layout-color3, #bdbdbd);\n",
" --xr-background-color: var(--jp-layout-color0, white);\n",
" --xr-background-color-row-even: var(--jp-layout-color1, white);\n",
" --xr-background-color-row-odd: var(--jp-layout-color2, #eeeeee);\n",
"}\n",
"\n",
"html[theme=dark],\n",
"body.vscode-dark {\n",
" --xr-font-color0: rgba(255, 255, 255, 1);\n",
" --xr-font-color2: rgba(255, 255, 255, 0.54);\n",
" --xr-font-color3: rgba(255, 255, 255, 0.38);\n",
" --xr-border-color: #1F1F1F;\n",
" --xr-disabled-color: #515151;\n",
" --xr-background-color: #111111;\n",
" --xr-background-color-row-even: #111111;\n",
" --xr-background-color-row-odd: #313131;\n",
"}\n",
"\n",
".xr-wrap {\n",
" display: block;\n",
" min-width: 300px;\n",
" max-width: 700px;\n",
"}\n",
"\n",
".xr-text-repr-fallback {\n",
" /* fallback to plain text repr when CSS is not injected (untrusted notebook) */\n",
" display: none;\n",
"}\n",
"\n",
".xr-header {\n",
" padding-top: 6px;\n",
" padding-bottom: 6px;\n",
" margin-bottom: 4px;\n",
" border-bottom: solid 1px var(--xr-border-color);\n",
"}\n",
"\n",
".xr-header > div,\n",
".xr-header > ul {\n",
" display: inline;\n",
" margin-top: 0;\n",
" margin-bottom: 0;\n",
"}\n",
"\n",
".xr-obj-type,\n",
".xr-array-name {\n",
" margin-left: 2px;\n",
" margin-right: 10px;\n",
"}\n",
"\n",
".xr-obj-type {\n",
" color: var(--xr-font-color2);\n",
"}\n",
"\n",
".xr-sections {\n",
" padding-left: 0 !important;\n",
" display: grid;\n",
" grid-template-columns: 150px auto auto 1fr 20px 20px;\n",
"}\n",
"\n",
".xr-section-item {\n",
" display: contents;\n",
"}\n",
"\n",
".xr-section-item input {\n",
" display: none;\n",
"}\n",
"\n",
".xr-section-item input + label {\n",
" color: var(--xr-disabled-color);\n",
"}\n",
"\n",
".xr-section-item input:enabled + label {\n",
" cursor: pointer;\n",
" color: var(--xr-font-color2);\n",
"}\n",
"\n",
".xr-section-item input:enabled + label:hover {\n",
" color: var(--xr-font-color0);\n",
"}\n",
"\n",
".xr-section-summary {\n",
" grid-column: 1;\n",
" color: var(--xr-font-color2);\n",
" font-weight: 500;\n",
"}\n",
"\n",
".xr-section-summary > span {\n",
" display: inline-block;\n",
" padding-left: 0.5em;\n",
"}\n",
"\n",
".xr-section-summary-in:disabled + label {\n",
" color: var(--xr-font-color2);\n",
"}\n",
"\n",
".xr-section-summary-in + label:before {\n",
" display: inline-block;\n",
" content: '►';\n",
" font-size: 11px;\n",
" width: 15px;\n",
" text-align: center;\n",
"}\n",
"\n",
".xr-section-summary-in:disabled + label:before {\n",
" color: var(--xr-disabled-color);\n",
"}\n",
"\n",
".xr-section-summary-in:checked + label:before {\n",
" content: '▼';\n",
"}\n",
"\n",
".xr-section-summary-in:checked + label > span {\n",
" display: none;\n",
"}\n",
"\n",
".xr-section-summary,\n",
".xr-section-inline-details {\n",
" padding-top: 4px;\n",
" padding-bottom: 4px;\n",
"}\n",
"\n",
".xr-section-inline-details {\n",
" grid-column: 2 / -1;\n",
"}\n",
"\n",
".xr-section-details {\n",
" display: none;\n",
" grid-column: 1 / -1;\n",
" margin-bottom: 5px;\n",
"}\n",
"\n",
".xr-section-summary-in:checked ~ .xr-section-details {\n",
" display: contents;\n",
"}\n",
"\n",
".xr-array-wrap {\n",
" grid-column: 1 / -1;\n",
" display: grid;\n",
" grid-template-columns: 20px auto;\n",
"}\n",
"\n",
".xr-array-wrap > label {\n",
" grid-column: 1;\n",
" vertical-align: top;\n",
"}\n",
"\n",
".xr-preview {\n",
" color: var(--xr-font-color3);\n",
"}\n",
"\n",
".xr-array-preview,\n",
".xr-array-data {\n",
" padding: 0 5px !important;\n",
" grid-column: 2;\n",
"}\n",
"\n",
".xr-array-data,\n",
".xr-array-in:checked ~ .xr-array-preview {\n",
" display: none;\n",
"}\n",
"\n",
".xr-array-in:checked ~ .xr-array-data,\n",
".xr-array-preview {\n",
" display: inline-block;\n",
"}\n",
"\n",
".xr-dim-list {\n",
" display: inline-block !important;\n",
" list-style: none;\n",
" padding: 0 !important;\n",
" margin: 0;\n",
"}\n",
"\n",
".xr-dim-list li {\n",
" display: inline-block;\n",
" padding: 0;\n",
" margin: 0;\n",
"}\n",
"\n",
".xr-dim-list:before {\n",
" content: '(';\n",
"}\n",
"\n",
".xr-dim-list:after {\n",
" content: ')';\n",
"}\n",
"\n",
".xr-dim-list li:not(:last-child):after {\n",
" content: ',';\n",
" padding-right: 5px;\n",
"}\n",
"\n",
".xr-has-index {\n",
" font-weight: bold;\n",
"}\n",
"\n",
".xr-var-list,\n",
".xr-var-item {\n",
" display: contents;\n",
"}\n",
"\n",
".xr-var-item > div,\n",
".xr-var-item label,\n",
".xr-var-item > .xr-var-name span {\n",
" background-color: var(--xr-background-color-row-even);\n",
" margin-bottom: 0;\n",
"}\n",
"\n",
".xr-var-item > .xr-var-name:hover span {\n",
" padding-right: 5px;\n",
"}\n",
"\n",
".xr-var-list > li:nth-child(odd) > div,\n",
".xr-var-list > li:nth-child(odd) > label,\n",
".xr-var-list > li:nth-child(odd) > .xr-var-name span {\n",
" background-color: var(--xr-background-color-row-odd);\n",
"}\n",
"\n",
".xr-var-name {\n",
" grid-column: 1;\n",
"}\n",
"\n",
".xr-var-dims {\n",
" grid-column: 2;\n",
"}\n",
"\n",
".xr-var-dtype {\n",
" grid-column: 3;\n",
" text-align: right;\n",
" color: var(--xr-font-color2);\n",
"}\n",
"\n",
".xr-var-preview {\n",
" grid-column: 4;\n",
"}\n",
"\n",
".xr-var-name,\n",
".xr-var-dims,\n",
".xr-var-dtype,\n",
".xr-preview,\n",
".xr-attrs dt {\n",
" white-space: nowrap;\n",
" overflow: hidden;\n",
" text-overflow: ellipsis;\n",
" padding-right: 10px;\n",
"}\n",
"\n",
".xr-var-name:hover,\n",
".xr-var-dims:hover,\n",
".xr-var-dtype:hover,\n",
".xr-attrs dt:hover {\n",
" overflow: visible;\n",
" width: auto;\n",
" z-index: 1;\n",
"}\n",
"\n",
".xr-var-attrs,\n",
".xr-var-data {\n",
" display: none;\n",
" background-color: var(--xr-background-color) !important;\n",
" padding-bottom: 5px !important;\n",
"}\n",
"\n",
".xr-var-attrs-in:checked ~ .xr-var-attrs,\n",
".xr-var-data-in:checked ~ .xr-var-data {\n",
" display: block;\n",
"}\n",
"\n",
".xr-var-data > table {\n",
" float: right;\n",
"}\n",
"\n",
".xr-var-name span,\n",
".xr-var-data,\n",
".xr-attrs {\n",
" padding-left: 25px !important;\n",
"}\n",
"\n",
".xr-attrs,\n",
".xr-var-attrs,\n",
".xr-var-data {\n",
" grid-column: 1 / -1;\n",
"}\n",
"\n",
"dl.xr-attrs {\n",
" padding: 0;\n",
" margin: 0;\n",
" display: grid;\n",
" grid-template-columns: 125px auto;\n",
"}\n",
"\n",
".xr-attrs dt, dd {\n",
" padding: 0;\n",
" margin: 0;\n",
" float: left;\n",
" padding-right: 10px;\n",
" width: auto;\n",
"}\n",
"\n",
".xr-attrs dt {\n",
" font-weight: normal;\n",
" grid-column: 1;\n",
"}\n",
"\n",
".xr-attrs dt:hover span {\n",
" display: inline-block;\n",
" background: var(--xr-background-color);\n",
" padding-right: 10px;\n",
"}\n",
"\n",
".xr-attrs dd {\n",
" grid-column: 2;\n",
" white-space: pre-wrap;\n",
" word-break: break-all;\n",
"}\n",
"\n",
".xr-icon-database,\n",
".xr-icon-file-text2 {\n",
" display: inline-block;\n",
" vertical-align: middle;\n",
" width: 1em;\n",
" height: 1.5em !important;\n",
" stroke-width: 0;\n",
" stroke: currentColor;\n",
" fill: currentColor;\n",
"}\n",
"</style><pre class='xr-text-repr-fallback'>&lt;xarray.Dataset&gt;\n",
"Dimensions: (lat: 25, lon: 53, time: 2920)\n",
"Coordinates:\n",
" * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
" * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
" * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
"Data variables:\n",
" air (time, lat, lon) float32 ...\n",
"Attributes:\n",
" Conventions: COARDS\n",
" title: 4x daily NMC reanalysis (1948)\n",
" description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n",
" platform: Model\n",
" references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...</pre><div class='xr-wrap' hidden><div class='xr-header'><div class='xr-obj-type'>xarray.Dataset</div></div><ul class='xr-sections'><li class='xr-section-item'><input id='section-dfd13d8e-4433-4254-b735-e8ca0b13fbb3' class='xr-section-summary-in' type='checkbox' disabled ><label for='section-dfd13d8e-4433-4254-b735-e8ca0b13fbb3' class='xr-section-summary' title='Expand/collapse section'>Dimensions:</label><div class='xr-section-inline-details'><ul class='xr-dim-list'><li><span class='xr-has-index'>lat</span>: 25</li><li><span class='xr-has-index'>lon</span>: 53</li><li><span class='xr-has-index'>time</span>: 2920</li></ul></div><div class='xr-section-details'></div></li><li class='xr-section-item'><input id='section-4e2d48c0-dc00-482f-9a95-58e8864d594b' class='xr-section-summary-in' type='checkbox' checked><label for='section-4e2d48c0-dc00-482f-9a95-58e8864d594b' class='xr-section-summary' >Coordinates: <span>(3)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><ul class='xr-var-list'><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>lat</span></div><div class='xr-var-dims'>(lat)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>75.0 72.5 70.0 ... 
20.0 17.5 15.0</div><input id='attrs-f848eb78-8b71-4655-a9f4-76e069e29057' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-f848eb78-8b71-4655-a9f4-76e069e29057' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-61fcd39d-b03f-4612-bc39-ce48316632fb' class='xr-var-data-in' type='checkbox'><label for='data-61fcd39d-b03f-4612-bc39-ce48316632fb' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>latitude</dd><dt><span>long_name :</span></dt><dd>Latitude</dd><dt><span>units :</span></dt><dd>degrees_north</dd><dt><span>axis :</span></dt><dd>Y</dd></dl></div><div class='xr-var-data'><pre>array([75. , 72.5, 70. , 67.5, 65. , 62.5, 60. , 57.5, 55. , 52.5, 50. , 47.5,\n",
" 45. , 42.5, 40. , 37.5, 35. , 32.5, 30. , 27.5, 25. , 22.5, 20. , 17.5,\n",
" 15. ], dtype=float32)</pre></div></li><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>lon</span></div><div class='xr-var-dims'>(lon)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>200.0 202.5 205.0 ... 327.5 330.0</div><input id='attrs-1c42a6a7-1034-4359-b571-022ed7b45edf' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-1c42a6a7-1034-4359-b571-022ed7b45edf' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-732ac6d3-1d2b-49a5-8b9e-e0d6e47f4eb7' class='xr-var-data-in' type='checkbox'><label for='data-732ac6d3-1d2b-49a5-8b9e-e0d6e47f4eb7' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>longitude</dd><dt><span>long_name :</span></dt><dd>Longitude</dd><dt><span>units :</span></dt><dd>degrees_east</dd><dt><span>axis :</span></dt><dd>X</dd></dl></div><div class='xr-var-data'><pre>array([200. , 202.5, 205. , 207.5, 210. , 212.5, 215. , 217.5, 220. , 222.5,\n",
" 225. , 227.5, 230. , 232.5, 235. , 237.5, 240. , 242.5, 245. , 247.5,\n",
" 250. , 252.5, 255. , 257.5, 260. , 262.5, 265. , 267.5, 270. , 272.5,\n",
" 275. , 277.5, 280. , 282.5, 285. , 287.5, 290. , 292.5, 295. , 297.5,\n",
" 300. , 302.5, 305. , 307.5, 310. , 312.5, 315. , 317.5, 320. , 322.5,\n",
" 325. , 327.5, 330. ], dtype=float32)</pre></div></li><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>time</span></div><div class='xr-var-dims'>(time)</div><div class='xr-var-dtype'>datetime64[ns]</div><div class='xr-var-preview xr-preview'>2013-01-01 ... 2014-12-31T18:00:00</div><input id='attrs-e60f7082-e5ab-445d-a478-58d68a6ff6d7' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-e60f7082-e5ab-445d-a478-58d68a6ff6d7' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-29b258fe-0129-495d-8f96-ab8697eaa374' class='xr-var-data-in' type='checkbox'><label for='data-29b258fe-0129-495d-8f96-ab8697eaa374' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>time</dd><dt><span>long_name :</span></dt><dd>Time</dd></dl></div><div class='xr-var-data'><pre>array([&#x27;2013-01-01T00:00:00.000000000&#x27;, &#x27;2013-01-01T06:00:00.000000000&#x27;,\n",
" &#x27;2013-01-01T12:00:00.000000000&#x27;, ..., &#x27;2014-12-31T06:00:00.000000000&#x27;,\n",
" &#x27;2014-12-31T12:00:00.000000000&#x27;, &#x27;2014-12-31T18:00:00.000000000&#x27;],\n",
" dtype=&#x27;datetime64[ns]&#x27;)</pre></div></li></ul></div></li><li class='xr-section-item'><input id='section-52573bf5-8ddc-4d57-840f-d83b96af4671' class='xr-section-summary-in' type='checkbox' checked><label for='section-52573bf5-8ddc-4d57-840f-d83b96af4671' class='xr-section-summary' >Data variables: <span>(1)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><ul class='xr-var-list'><li class='xr-var-item'><div class='xr-var-name'><span>air</span></div><div class='xr-var-dims'>(time, lat, lon)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>...</div><input id='attrs-d3542941-fa67-4ffe-8bf2-c16a9263eb37' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-d3542941-fa67-4ffe-8bf2-c16a9263eb37' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-57c62f1a-f3eb-43d1-a69f-fca6666fedfc' class='xr-var-data-in' type='checkbox'><label for='data-57c62f1a-f3eb-43d1-a69f-fca6666fedfc' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>long_name :</span></dt><dd>4xDaily Air temperature at sigma level 995</dd><dt><span>units :</span></dt><dd>degK</dd><dt><span>precision :</span></dt><dd>2</dd><dt><span>GRIB_id :</span></dt><dd>11</dd><dt><span>GRIB_name :</span></dt><dd>TMP</dd><dt><span>var_desc :</span></dt><dd>Air temperature</dd><dt><span>dataset :</span></dt><dd>NMC Reanalysis</dd><dt><span>level_desc :</span></dt><dd>Surface</dd><dt><span>statistic :</span></dt><dd>Individual Obs</dd><dt><span>parent_stat :</span></dt><dd>Other</dd><dt><span>actual_range :</span></dt><dd>[185.16 322.1 ]</dd></dl></div><div class='xr-var-data'><pre>[3869000 values with dtype=float32]</pre></div></li></ul></div></li><li class='xr-section-item'><input id='section-d69f0756-55ba-4e13-831d-a1e89de74ea7' 
class='xr-section-summary-in' type='checkbox' checked><label for='section-d69f0756-55ba-4e13-831d-a1e89de74ea7' class='xr-section-summary' >Attributes: <span>(5)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><dl class='xr-attrs'><dt><span>Conventions :</span></dt><dd>COARDS</dd><dt><span>title :</span></dt><dd>4x daily NMC reanalysis (1948)</dd><dt><span>description :</span></dt><dd>Data is from NMC initialized reanalysis\n",
"(4x/day). These are the 0.9950 sigma level values.</dd><dt><span>platform :</span></dt><dd>Model</dd><dt><span>references :</span></dt><dd>http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html</dd></dl></div></li></ul></div></div>"
],
"text/plain": [
"<xarray.Dataset>\n",
"Dimensions: (lat: 25, lon: 53, time: 2920)\n",
"Coordinates:\n",
" * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
" * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
" * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
"Data variables:\n",
" air (time, lat, lon) float32 ...\n",
"Attributes:\n",
" Conventions: COARDS\n",
" title: 4x daily NMC reanalysis (1948)\n",
" description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n",
" platform: Model\n",
" references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..."
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xr.open_dataset(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def ncfunc(f):\n",
" # open file\n",
" d = nc.Dataset(f, 'r')\n",
"\n",
" #open time variable and pull values\n",
" # find what the time (unlimited) dimension is\n",
" if 'time' in d.variables.keys():\n",
" times = d['time']\n",
" start = str(times[0])\n",
" end = str(times[-1])\n",
" date = start + \"-\" + end\n",
"\n",
" #go through the variables\n",
" var_list = []\n",
" # loop through all variables\n",
" for v in d.variables.keys():\n",
" # add all variables that are not coordinates to the catalog\n",
" if v not in list(dict(d.dimensions).keys()):\n",
" var_list.append(v)\n",
"\n",
" #go through attributes\n",
" attr_list = {}\n",
" for v in var_list:\n",
" if hasattr(d.variables[v], 'units'):\n",
" attr_list[v] = getattr(d.variables[v], 'units')\n",
"\n",
" #close file\n",
" # close netcdf file\n",
" d.close()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def xfunc(f):\n",
" # open file\n",
" d = xr.open_dataset(f, decode_times=True, use_cftime=True, chunks={})\n",
"\n",
" # get time variable\n",
" # find what the time (unlimited) dimension is\n",
" if 'time' in d.coords:\n",
" times = d['time']\n",
" start = times[0].dt.strftime('%Y-%m-%d').data.item()\n",
" end = times[-1].dt.strftime('%Y-%m-%d').data.item()\n",
" date = start + \"-\" + end\n",
"\n",
"    # go through variable list\n",
" var_list = [] \n",
" # loop through all variables\n",
" for v in d.variables.keys():\n",
" # add all variables that are not coordinates to the catalog\n",
" if v not in d.coords:\n",
" var_list.append(v)\n",
"\n",
" #go through attr list\n",
" attr_list = {}\n",
" for v in var_list:\n",
" if hasattr(d.variables[v], 'units'):\n",
" attr_list[v] = getattr(d.variables[v], 'units')\n",
"\n",
" #close the file\n",
" # close netcdf file\n",
" d.close()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%load_ext line_profiler"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.047982 s\n",
"File: <ipython-input-5-ddc292be8500>\n",
"Function: xfunc at line 1\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 1 def xfunc(f):\n",
" 2 # open file\n",
" 3 1 43118.0 43118.0 89.9 d = xr.open_dataset(f, decode_times=True, use_cftime=True, chunks={})\n",
" 4 \n",
" 5 # get time variable\n",
" 6 # find what the time (unlimited) dimension is\n",
" 7 1 19.0 19.0 0.0 if 'time' in d.coords:\n",
" 8 1 83.0 83.0 0.2 times = d['time']\n",
" 9 1 2536.0 2536.0 5.3 start = times[0].dt.strftime('%Y-%m-%d').data.item()\n",
" 10 1 2094.0 2094.0 4.4 end = times[-1].dt.strftime('%Y-%m-%d').data.item()\n",
" 11 1 3.0 3.0 0.0 date = start + \"-\" + end\n",
" 12 \n",
" 13 # got through variable list\n",
" 14 1 0.0 0.0 0.0 var_list = [] \n",
" 15 # loop through all variables\n",
" 16 5 15.0 3.0 0.0 for v in d.variables.keys():\n",
" 17 # add all variables that are not coordinates to the catalog\n",
" 18 4 20.0 5.0 0.0 if v not in d.coords:\n",
" 19 1 1.0 1.0 0.0 var_list.append(v)\n",
" 20 \n",
" 21 #go through attr list\n",
" 22 1 1.0 1.0 0.0 attr_list = {}\n",
" 23 2 2.0 1.0 0.0 for v in var_list:\n",
" 24 1 6.0 6.0 0.0 if hasattr(d.variables[v], 'units'):\n",
" 25 attr_list[v] = getattr(d.variables[v], 'units')\n",
" 26 \n",
" 27 #close the file\n",
" 28 # close netcdf file\n",
" 29 1 84.0 84.0 0.2 d.close()"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -f xfunc xfunc(f)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.007543 s\n",
"File: /glade/work/abanihi/devel/misc/blog-posts/api.py\n",
"Function: open_dataset at line 252\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 252 def open_dataset(\n",
" 253 filename_or_obj,\n",
" 254 group=None,\n",
" 255 decode_cf=True,\n",
" 256 mask_and_scale=None,\n",
" 257 decode_times=True,\n",
" 258 autoclose=None,\n",
" 259 concat_characters=True,\n",
" 260 decode_coords=True,\n",
" 261 engine=None,\n",
" 262 chunks=None,\n",
" 263 lock=None,\n",
" 264 cache=None,\n",
" 265 drop_variables=None,\n",
" 266 backend_kwargs=None,\n",
" 267 use_cftime=None,\n",
" 268 decode_timedelta=None,\n",
" 269 ):\n",
" 270 \"\"\"Open and decode a dataset from a file or file-like object.\n",
" 271 Parameters\n",
" 272 ----------\n",
" 273 filename_or_obj : str, Path, file or xarray.backends.*DataStore\n",
" 274 Strings and Path objects are interpreted as a path to a netCDF file\n",
" 275 or an OpenDAP URL and opened with python-netCDF4, unless the filename\n",
" 276 ends with .gz, in which case the file is gunzipped and opened with\n",
" 277 scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like\n",
" 278 objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).\n",
" 279 group : str, optional\n",
" 280 Path to the netCDF4 group in the given file to open (only works for\n",
" 281 netCDF4 files).\n",
" 282 decode_cf : bool, optional\n",
" 283 Whether to decode these variables, assuming they were saved according\n",
" 284 to CF conventions.\n",
" 285 mask_and_scale : bool, optional\n",
" 286 If True, replace array values equal to `_FillValue` with NA and scale\n",
" 287 values according to the formula `original_values * scale_factor +\n",
" 288 add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are\n",
" 289 taken from variable attributes (if they exist). If the `_FillValue` or\n",
" 290 `missing_value` attribute contains multiple values a warning will be\n",
" 291 issued and all array values matching one of the multiple values will\n",
" 292 be replaced by NA. mask_and_scale defaults to True except for the\n",
" 293 pseudonetcdf backend.\n",
" 294 decode_times : bool, optional\n",
" 295 If True, decode times encoded in the standard NetCDF datetime format\n",
" 296 into datetime objects. Otherwise, leave them encoded as numbers.\n",
" 297 autoclose : bool, optional\n",
" 298 If True, automatically close files to avoid OS Error of too many files\n",
" 299 being open. However, this option doesn't work with streams, e.g.,\n",
" 300 BytesIO.\n",
" 301 concat_characters : bool, optional\n",
" 302 If True, concatenate along the last dimension of character arrays to\n",
" 303 form string arrays. Dimensions will only be concatenated over (and\n",
" 304 removed) if they have no corresponding variable and if they are only\n",
" 305 used as the last dimension of character arrays.\n",
" 306 decode_coords : bool, optional\n",
" 307 If True, decode the 'coordinates' attribute to identify coordinates in\n",
" 308 the resulting dataset.\n",
" 309 engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', \\\n",
" 310 'pseudonetcdf'}, optional\n",
" 311 Engine to use when reading files. If not provided, the default engine\n",
" 312 is chosen based on available dependencies, with a preference for\n",
" 313 'netcdf4'.\n",
" 314 chunks : int or dict, optional\n",
" 315 If chunks is provided, it used to load the new dataset into dask\n",
" 316 arrays. ``chunks={}`` loads the dataset with dask using a single\n",
" 317 chunk for all arrays.\n",
" 318 lock : False or duck threading.Lock, optional\n",
" 319 Resource lock to use when reading data from disk. Only relevant when\n",
" 320 using dask or another form of parallelism. By default, appropriate\n",
" 321 locks are chosen to safely read and write files with the currently\n",
" 322 active dask scheduler.\n",
" 323 cache : bool, optional\n",
" 324 If True, cache data loaded from the underlying datastore in memory as\n",
" 325 NumPy arrays when accessed to avoid reading from the underlying data-\n",
" 326 store multiple times. Defaults to True unless you specify the `chunks`\n",
" 327 argument to use dask, in which case it defaults to False. Does not\n",
" 328 change the behavior of coordinates corresponding to dimensions, which\n",
" 329 always load their data from disk into a ``pandas.Index``.\n",
" 330 drop_variables: string or iterable, optional\n",
" 331 A variable or list of variables to exclude from being parsed from the\n",
" 332 dataset. This may be useful to drop variables with problems or\n",
" 333 inconsistent values.\n",
" 334 backend_kwargs: dictionary, optional\n",
" 335 A dictionary of keyword arguments to pass on to the backend. This\n",
" 336 may be useful when backend options would improve performance or\n",
" 337 allow user control of dataset processing.\n",
" 338 use_cftime: bool, optional\n",
" 339 Only relevant if encoded dates come from a standard calendar\n",
" 340 (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
" 341 specified). If None (default), attempt to decode times to\n",
" 342 ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
" 343 ``cftime.datetime`` objects. If True, always decode times to\n",
" 344 ``cftime.datetime`` objects, regardless of whether or not they can be\n",
" 345 represented using ``np.datetime64[ns]`` objects. If False, always\n",
" 346 decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
" 347 raise an error.\n",
" 348 decode_timedelta : bool, optional\n",
" 349 If True, decode variables and coordinates with time units in\n",
" 350 {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}\n",
" 351 into timedelta objects. If False, leave them encoded as numbers.\n",
" 352 If None (default), assume the same value of decode_time.\n",
" 353 Returns\n",
" 354 -------\n",
" 355 dataset : Dataset\n",
" 356 The newly created dataset.\n",
" 357 Notes\n",
" 358 -----\n",
" 359 ``open_dataset`` opens the file with read-only access. When you modify\n",
" 360 values of a Dataset, even one linked to files on disk, only the in-memory\n",
" 361 copy you are manipulating in xarray is modified: the original file on disk\n",
" 362 is never touched.\n",
" 363 See Also\n",
" 364 --------\n",
" 365 open_mfdataset\n",
" 366 \"\"\"\n",
" 367 engines = [\n",
" 368 1 2.0 2.0 0.0 None,\n",
" 369 1 1.0 1.0 0.0 \"netcdf4\",\n",
" 370 1 1.0 1.0 0.0 \"scipy\",\n",
" 371 1 1.0 1.0 0.0 \"pydap\",\n",
" 372 1 1.0 1.0 0.0 \"h5netcdf\",\n",
" 373 1 1.0 1.0 0.0 \"pynio\",\n",
" 374 1 1.0 1.0 0.0 \"cfgrib\",\n",
" 375 1 0.0 0.0 0.0 \"pseudonetcdf\",\n",
" 376 ]\n",
" 377 1 1.0 1.0 0.0 if engine not in engines:\n",
" 378 raise ValueError(\n",
" 379 \"unrecognized engine for open_dataset: {}\\n\"\n",
" 380 \"must be one of: {}\".format(engine, engines)\n",
" 381 )\n",
" 382 \n",
" 383 1 1.0 1.0 0.0 if autoclose is not None:\n",
" 384 warnings.warn(\n",
" 385 \"The autoclose argument is no longer used by \"\n",
" 386 \"xarray.open_dataset() and is now ignored; it will be removed in \"\n",
" 387 \"a future version of xarray. If necessary, you can control the \"\n",
" 388 \"maximum number of simultaneous open files with \"\n",
" 389 \"xarray.set_options(file_cache_maxsize=...).\",\n",
" 390 FutureWarning,\n",
" 391 stacklevel=2,\n",
" 392 )\n",
" 393 \n",
" 394 1 0.0 0.0 0.0 if mask_and_scale is None:\n",
" 395 1 0.0 0.0 0.0 mask_and_scale = not engine == \"pseudonetcdf\"\n",
" 396 \n",
" 397 1 1.0 1.0 0.0 if not decode_cf:\n",
" 398 mask_and_scale = False\n",
" 399 decode_times = False\n",
" 400 concat_characters = False\n",
" 401 decode_coords = False\n",
" 402 decode_timedelta = False\n",
" 403 \n",
" 404 1 0.0 0.0 0.0 if cache is None:\n",
" 405 1 1.0 1.0 0.0 cache = chunks is None\n",
" 406 \n",
" 407 1 1.0 1.0 0.0 if backend_kwargs is None:\n",
" 408 1 0.0 0.0 0.0 backend_kwargs = {}\n",
" 409 \n",
" 410 1 2.0 2.0 0.0 if isinstance(filename_or_obj, Path):\n",
" 411 filename_or_obj = str(filename_or_obj)\n",
" 412 \n",
" 413 1 4.0 4.0 0.1 if isinstance(filename_or_obj, AbstractDataStore):\n",
" 414 store = filename_or_obj\n",
" 415 \n",
" 416 1 1.0 1.0 0.0 elif isinstance(filename_or_obj, str):\n",
" 417 1 55.0 55.0 0.7 filename_or_obj = _normalize_path(filename_or_obj)\n",
" 418 \n",
" 419 1 1.0 1.0 0.0 if engine is None:\n",
" 420 engine = _get_default_engine(filename_or_obj, allow_remote=True)\n",
" 421 1 1.0 1.0 0.0 if engine == \"netcdf4\":\n",
" 422 1 2.0 2.0 0.0 store = backends.NetCDF4DataStore.open(\n",
" 423 1 745.0 745.0 9.9 filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
" 424 )\n",
" 425 \n",
" 426 elif engine == \"scipy\":\n",
" 427 store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)\n",
" 428 elif engine == \"pydap\":\n",
" 429 store = backends.PydapDataStore.open(filename_or_obj, **backend_kwargs)\n",
" 430 elif engine == \"h5netcdf\":\n",
" 431 store = backends.H5NetCDFStore.open(\n",
" 432 filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
" 433 )\n",
" 434 elif engine == \"pynio\":\n",
" 435 store = backends.NioDataStore(filename_or_obj, lock=lock, **backend_kwargs)\n",
" 436 elif engine == \"pseudonetcdf\":\n",
" 437 store = backends.PseudoNetCDFDataStore.open(\n",
" 438 filename_or_obj, lock=lock, **backend_kwargs\n",
" 439 )\n",
" 440 elif engine == \"cfgrib\":\n",
" 441 store = backends.CfGribDataStore(\n",
" 442 filename_or_obj, lock=lock, **backend_kwargs\n",
" 443 )\n",
" 444 \n",
" 445 else:\n",
" 446 if engine not in [None, \"scipy\", \"h5netcdf\"]:\n",
" 447 raise ValueError(\n",
" 448 \"can only read bytes or file-like objects \"\n",
" 449 \"with engine='scipy' or 'h5netcdf'\"\n",
" 450 )\n",
" 451 engine = _get_engine_from_magic_number(filename_or_obj)\n",
" 452 if engine == \"scipy\":\n",
" 453 store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)\n",
" 454 elif engine == \"h5netcdf\":\n",
" 455 store = backends.H5NetCDFStore.open(\n",
" 456 filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
" 457 )\n",
" 458 \n",
" 459 1 13.0 13.0 0.2 with close_on_error(store):\n",
" 460 1 6700.0 6700.0 88.8 ds = maybe_decode_store(store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, cache, chunks, filename_or_obj, group, decode_cf, engine)\n",
" 461 \n",
" 462 # Ensure source filename always stored in dataset object (GH issue #2550)\n",
" 463 1 3.0 3.0 0.0 if \"source\" not in ds.encoding:\n",
" 464 1 1.0 1.0 0.0 if isinstance(filename_or_obj, str):\n",
" 465 1 2.0 2.0 0.0 ds.encoding[\"source\"] = filename_or_obj\n",
" 466 \n",
" 467 1 0.0 0.0 0.0 return ds"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -s -f xr.open_dataset xr.open_dataset(f, engine='netcdf4')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.006774 s\n",
"File: /glade/work/abanihi/devel/misc/blog-posts/api.py\n",
"Function: maybe_decode_store at line 205\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 205 def maybe_decode_store(store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, cache, chunks, filename_or_obj, group, decode_cf, engine, lock=False):\n",
" 206 1 3.0 3.0 0.0 ds = conventions.decode_cf(\n",
" 207 1 1.0 1.0 0.0 store,\n",
" 208 1 1.0 1.0 0.0 mask_and_scale=mask_and_scale,\n",
" 209 1 1.0 1.0 0.0 decode_times=decode_times,\n",
" 210 1 1.0 1.0 0.0 concat_characters=concat_characters,\n",
" 211 1 1.0 1.0 0.0 decode_coords=decode_coords,\n",
" 212 1 0.0 0.0 0.0 drop_variables=drop_variables,\n",
" 213 1 0.0 0.0 0.0 use_cftime=use_cftime,\n",
" 214 1 6703.0 6703.0 99.0 decode_timedelta=decode_timedelta,\n",
" 215 )\n",
" 216 \n",
" 217 1 60.0 60.0 0.9 _protect_dataset_variables_inplace(ds, cache)\n",
" 218 \n",
" 219 1 1.0 1.0 0.0 if chunks is not None:\n",
" 220 from dask.base import tokenize\n",
" 221 \n",
" 222 # if passed an actual file path, augment the token with\n",
" 223 # the file modification time\n",
" 224 if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):\n",
" 225 mtime = os.path.getmtime(filename_or_obj)\n",
" 226 else:\n",
" 227 mtime = None\n",
" 228 token = tokenize(\n",
" 229 filename_or_obj,\n",
" 230 mtime,\n",
" 231 group,\n",
" 232 decode_cf,\n",
" 233 mask_and_scale,\n",
" 234 decode_times,\n",
" 235 concat_characters,\n",
" 236 decode_coords,\n",
" 237 engine,\n",
" 238 chunks,\n",
" 239 drop_variables,\n",
" 240 use_cftime,\n",
" 241 decode_timedelta,\n",
" 242 )\n",
" 243 name_prefix = \"open_dataset-%s\" % token\n",
" 244 ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)\n",
" 245 ds2._file_obj = ds._file_obj\n",
" 246 else:\n",
" 247 1 1.0 1.0 0.0 ds2 = ds\n",
" 248 \n",
" 249 1 1.0 1.0 0.0 return ds2"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -s -f maybe_decode_store xr.open_dataset(f, engine='netcdf4')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from xarray.conventions import decode_cf, decode_cf_variable\n",
"import xarray.conventions"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.00666 s\n",
"File: /glade/work/abanihi/softwares/miniconda3/envs/playground/lib/python3.7/site-packages/xarray/conventions.py\n",
"Function: decode_cf at line 517\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 517 def decode_cf(\n",
" 518 obj,\n",
" 519 concat_characters=True,\n",
" 520 mask_and_scale=True,\n",
" 521 decode_times=True,\n",
" 522 decode_coords=True,\n",
" 523 drop_variables=None,\n",
" 524 use_cftime=None,\n",
" 525 decode_timedelta=None,\n",
" 526 ):\n",
" 527 \"\"\"Decode the given Dataset or Datastore according to CF conventions into\n",
" 528 a new Dataset.\n",
" 529 \n",
" 530 Parameters\n",
" 531 ----------\n",
" 532 obj : Dataset or DataStore\n",
" 533 Object to decode.\n",
" 534 concat_characters : bool, optional\n",
" 535 Should character arrays be concatenated to strings, for\n",
" 536 example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'\n",
" 537 mask_and_scale: bool, optional\n",
" 538 Lazily scale (using scale_factor and add_offset) and mask\n",
" 539 (using _FillValue).\n",
" 540 decode_times : bool, optional\n",
" 541 Decode cf times (e.g., integers since 'hours since 2000-01-01') to\n",
" 542 np.datetime64.\n",
" 543 decode_coords : bool, optional\n",
" 544 Use the 'coordinates' attribute on variable (or the dataset itself) to\n",
" 545 identify coordinates.\n",
" 546 drop_variables: string or iterable, optional\n",
" 547 A variable or list of variables to exclude from being parsed from the\n",
" 548 dataset. This may be useful to drop variables with problems or\n",
" 549 inconsistent values.\n",
" 550 use_cftime: bool, optional\n",
" 551 Only relevant if encoded dates come from a standard calendar\n",
" 552 (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
" 553 specified). If None (default), attempt to decode times to\n",
" 554 ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
" 555 ``cftime.datetime`` objects. If True, always decode times to\n",
" 556 ``cftime.datetime`` objects, regardless of whether or not they can be\n",
" 557 represented using ``np.datetime64[ns]`` objects. If False, always\n",
" 558 decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
" 559 raise an error.\n",
" 560 decode_timedelta : bool, optional\n",
" 561 If True, decode variables and coordinates with time units in\n",
" 562 {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}\n",
" 563 into timedelta objects. If False, leave them encoded as numbers.\n",
" 564 If None (default), assume the same value of decode_time.\n",
" 565 \n",
" 566 Returns\n",
" 567 -------\n",
" 568 decoded : Dataset\n",
" 569 \"\"\"\n",
" 570 1 8.0 8.0 0.1 from .core.dataset import Dataset\n",
" 571 1 5.0 5.0 0.1 from .backends.common import AbstractDataStore\n",
" 572 \n",
" 573 1 4.0 4.0 0.1 if isinstance(obj, Dataset):\n",
" 574 vars = obj._variables\n",
" 575 attrs = obj.attrs\n",
" 576 extra_coords = set(obj.coords)\n",
" 577 file_obj = obj._file_obj\n",
" 578 encoding = obj.encoding\n",
" 579 1 2.0 2.0 0.0 elif isinstance(obj, AbstractDataStore):\n",
" 580 1 855.0 855.0 12.8 vars, attrs = obj.load()\n",
" 581 1 2.0 2.0 0.0 extra_coords = set()\n",
" 582 1 0.0 0.0 0.0 file_obj = obj\n",
" 583 1 37.0 37.0 0.6 encoding = obj.get_encoding()\n",
" 584 else:\n",
" 585 raise TypeError(\"can only decode Dataset or DataStore objects\")\n",
" 586 \n",
" 587 1 0.0 0.0 0.0 vars, attrs, coord_names = decode_cf_variables(\n",
" 588 1 1.0 1.0 0.0 vars,\n",
" 589 1 0.0 0.0 0.0 attrs,\n",
" 590 1 0.0 0.0 0.0 concat_characters,\n",
" 591 1 1.0 1.0 0.0 mask_and_scale,\n",
" 592 1 1.0 1.0 0.0 decode_times,\n",
" 593 1 0.0 0.0 0.0 decode_coords,\n",
" 594 1 1.0 1.0 0.0 drop_variables=drop_variables,\n",
" 595 1 1.0 1.0 0.0 use_cftime=use_cftime,\n",
" 596 1 2698.0 2698.0 40.5 decode_timedelta=decode_timedelta,\n",
" 597 )\n",
" 598 1 2863.0 2863.0 43.0 ds = Dataset(vars, attrs=attrs)\n",
" 599 1 175.0 175.0 2.6 ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))\n",
" 600 1 2.0 2.0 0.0 ds._file_obj = file_obj\n",
" 601 1 4.0 4.0 0.1 ds.encoding = encoding\n",
" 602 \n",
" 603 1 0.0 0.0 0.0 return ds"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -s -f decode_cf xr.open_dataset(f, engine='netcdf4')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.002416 s\n",
"File: /glade/work/abanihi/softwares/miniconda3/envs/playground/lib/python3.7/site-packages/xarray/conventions.py\n",
"Function: decode_cf_variable at line 260\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 260 def decode_cf_variable(\n",
" 261 name,\n",
" 262 var,\n",
" 263 concat_characters=True,\n",
" 264 mask_and_scale=True,\n",
" 265 decode_times=True,\n",
" 266 decode_endianness=True,\n",
" 267 stack_char_dim=True,\n",
" 268 use_cftime=None,\n",
" 269 decode_timedelta=None,\n",
" 270 ):\n",
" 271 \"\"\"\n",
" 272 Decodes a variable which may hold CF encoded information.\n",
" 273 \n",
" 274 This includes variables that have been masked and scaled, which\n",
" 275 hold CF style time variables (this is almost always the case if\n",
" 276 the dataset has been serialized) and which have strings encoded\n",
" 277 as character arrays.\n",
" 278 \n",
" 279 Parameters\n",
" 280 ----------\n",
" 281 name: str\n",
" 282 Name of the variable. Used for better error messages.\n",
" 283 var : Variable\n",
" 284 A variable holding potentially CF encoded information.\n",
" 285 concat_characters : bool\n",
" 286 Should character arrays be concatenated to strings, for\n",
" 287 example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'\n",
" 288 mask_and_scale: bool\n",
" 289 Lazily scale (using scale_factor and add_offset) and mask\n",
" 290 (using _FillValue). If the _Unsigned attribute is present\n",
" 291 treat integer arrays as unsigned.\n",
" 292 decode_times : bool\n",
" 293 Decode cf times ('hours since 2000-01-01') to np.datetime64.\n",
" 294 decode_endianness : bool\n",
" 295 Decode arrays from non-native to native endianness.\n",
" 296 stack_char_dim : bool\n",
" 297 Whether to stack characters into bytes along the last dimension of this\n",
" 298 array. Passed as an argument because we need to look at the full\n",
" 299 dataset to figure out if this is appropriate.\n",
" 300 use_cftime: bool, optional\n",
" 301 Only relevant if encoded dates come from a standard calendar\n",
" 302 (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
" 303 specified). If None (default), attempt to decode times to\n",
" 304 ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
" 305 ``cftime.datetime`` objects. If True, always decode times to\n",
" 306 ``cftime.datetime`` objects, regardless of whether or not they can be\n",
" 307 represented using ``np.datetime64[ns]`` objects. If False, always\n",
" 308 decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
" 309 raise an error.\n",
" 310 \n",
" 311 Returns\n",
" 312 -------\n",
" 313 out : Variable\n",
" 314 A variable holding the decoded equivalent of var.\n",
" 315 \"\"\"\n",
" 316 4 137.0 34.2 5.7 var = as_variable(var)\n",
" 317 4 6.0 1.5 0.2 original_dtype = var.dtype\n",
" 318 \n",
" 319 4 4.0 1.0 0.2 if decode_timedelta is None:\n",
" 320 4 4.0 1.0 0.2 decode_timedelta = decode_times\n",
" 321 \n",
" 322 4 3.0 0.8 0.1 if concat_characters:\n",
" 323 4 2.0 0.5 0.1 if stack_char_dim:\n",
" 324 var = strings.CharacterArrayCoder().decode(var, name=name)\n",
" 325 4 77.0 19.2 3.2 var = strings.EncodedStringCoder().decode(var)\n",
" 326 \n",
" 327 4 1.0 0.2 0.0 if mask_and_scale:\n",
" 328 4 1.0 0.2 0.0 for coder in [\n",
" 329 4 4.0 1.0 0.2 variables.UnsignedIntegerCoder(),\n",
" 330 4 4.0 1.0 0.2 variables.CFMaskCoder(),\n",
" 331 16 12.0 0.8 0.5 variables.CFScaleOffsetCoder(),\n",
" 332 ]:\n",
" 333 12 362.0 30.2 15.0 var = coder.decode(var, name=name)\n",
" 334 \n",
" 335 4 4.0 1.0 0.2 if decode_timedelta:\n",
" 336 4 58.0 14.5 2.4 var = times.CFTimedeltaCoder().decode(var, name=name)\n",
" 337 4 3.0 0.8 0.1 if decode_times:\n",
" 338 4 1607.0 401.8 66.5 var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)\n",
" 339 \n",
" 340 4 13.0 3.2 0.5 dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)\n",
" 341 # TODO(shoyer): convert everything below to use coders\n",
" 342 \n",
" 343 4 6.0 1.5 0.2 if decode_endianness and not data.dtype.isnative:\n",
" 344 # do this last, so it's only done if we didn't already unmask/scale\n",
" 345 data = NativeEndiannessArray(data)\n",
" 346 original_dtype = data.dtype\n",
" 347 \n",
" 348 4 4.0 1.0 0.2 encoding.setdefault(\"dtype\", original_dtype)\n",
" 349 \n",
" 350 4 2.0 0.5 0.1 if \"dtype\" in attributes and attributes[\"dtype\"] == \"bool\":\n",
" 351 del attributes[\"dtype\"]\n",
" 352 data = BoolTypeArray(data)\n",
" 353 \n",
" 354 4 3.0 0.8 0.1 if not isinstance(data, dask_array_type):\n",
" 355 4 45.0 11.2 1.9 data = indexing.LazilyOuterIndexedArray(data)\n",
" 356 \n",
" 357 4 54.0 13.5 2.2 return Variable(dimensions, data, attributes, encoding=encoding)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -s -f xarray.conventions.decode_cf_variable xr.open_dataset(f, engine='netcdf4')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment