andersy005 · July 18, 2020 05:06
diff --git a/api.py b/api.py
 import os.path
 import warnings
 from glob import glob
 from io import BytesIO
 from numbers import Number
 from pathlib import Path
 from xarray.backends.common import AbstractDataStore, ArrayWriter
 from xarray.core.utils import close_on_error, is_grib_path, is_remote_uri
 from xarray.core import indexing
 from xarray import backends, conventions, coding


 def _get_default_engine_remote_uri():
    try:
        import netCDF4  # noqa: F401

        engine = "netcdf4"
    except ImportError:  # pragma: no cover
        try:
            import pydap  # noqa: F401

            engine = "pydap"
        except ImportError:
            raise ValueError(
                "netCDF4 or pydap is required for accessing "
                "remote datasets via OPeNDAP"
            )
    return engine


 def _get_default_engine_grib():
    msgs = []
    try:
        import Nio  # noqa: F401

        msgs += ["set engine='pynio' to access GRIB files with PyNIO"]
    except ImportError:  # pragma: no cover
        pass
    try:
        import cfgrib  # noqa: F401

        msgs += ["set engine='cfgrib' to access GRIB files with cfgrib"]
    except ImportError:  # pragma: no cover
        pass
    if msgs:
        raise ValueError(" or\n".join(msgs))
    else:
        raise ValueError("PyNIO or cfgrib is required for accessing " "GRIB files")


 def _get_default_engine_gz():
    try:
        import scipy  # noqa: F401

        engine = "scipy"
    except ImportError:  # pragma: no cover
        raise ValueError("scipy is required for accessing .gz files")
    return engine


 def _get_default_engine_netcdf():
    try:
        import netCDF4  # noqa: F401

        engine = "netcdf4"
    except ImportError:  # pragma: no cover
        try:
            import scipy.io.netcdf  # noqa: F401

            engine = "scipy"
        except ImportError:
            raise ValueError(
                "cannot read or write netCDF files without "
                "netCDF4-python or scipy installed"
            )
    return engine


 def _get_engine_from_magic_number(filename_or_obj):
    # check byte header to determine file type
    if isinstance(filename_or_obj, bytes):
        magic_number = filename_or_obj[:8]
    else:
        if filename_or_obj.tell() != 0:
            raise ValueError(
                "file-like object read/write pointer not at zero "
                "please close and reopen, or use a context "
                "manager"
            )
        magic_number = filename_or_obj.read(8)
        filename_or_obj.seek(0)

    if magic_number.startswith(b"CDF"):
        engine = "scipy"
    elif magic_number.startswith(b"\211HDF\r\n\032\n"):
        engine = "h5netcdf"
        if isinstance(filename_or_obj, bytes):
            raise ValueError(
                "can't open netCDF4/HDF5 as bytes "
                "try passing a path or file-like object"
            )
    else:
        if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
            filename_or_obj = filename_or_obj[:80] + b"..."
        raise ValueError(
            "{} is not a valid netCDF file "
            "did you mean to pass a string for a path instead?".format(filename_or_obj)
        )
    return engine


 def _get_default_engine(path, allow_remote=False):
    """Pick a default engine from the form of ``path``.

    The checks run in this order: remote URI (only when ``allow_remote`` is
    true), GRIB path, ``.gz`` suffix, and finally plain netCDF.

    Parameters
    ----------
    path : str
        Path or URI of the dataset.
    allow_remote : bool, optional
        Whether remote URIs may select the remote-URI engine.

    Returns
    -------
    str
        The engine name returned by the matching helper.
    """
    if allow_remote and is_remote_uri(path):
        return _get_default_engine_remote_uri()
    if is_grib_path(path):
        return _get_default_engine_grib()
    if path.endswith(".gz"):
        return _get_default_engine_gz()
    return _get_default_engine_netcdf()


 def _normalize_path(path):
    """Return ``path`` unchanged if it is a remote URI, else made absolute.

    Local paths have ``~`` expanded before being converted to an absolute
    path.
    """
    if not is_remote_uri(path):
        path = os.path.abspath(os.path.expanduser(path))
    return path
    

 def _validate_dataset_names(dataset):
    """DataArray.name and Dataset keys must be a string or None"""

    def check_name(name):
        if isinstance(name, str):
            if not name:
                raise ValueError(
                    "Invalid name for DataArray or Dataset key: "
                    "string must be length 1 or greater for "
                    "serialization to netCDF files"
                )
        elif name is not None:
            raise TypeError(
                "DataArray.name or Dataset key must be either a "
                "string or None for serialization to netCDF files"
            )

    for k in dataset.variables:
        check_name(k)


 def _validate_attrs(dataset):
    """`attrs` must have a string key and a value which is either: a number,
    a string, an ndarray or a list/tuple of numbers/strings.
    """

    def check_attr(name, value):
        if isinstance(name, str):
            if not name:
                raise ValueError(
                    "Invalid name for attr: string must be "
                    "length 1 or greater for serialization to "
                    "netCDF files"
                )
        else:
            raise TypeError(
                "Invalid name for attr: {} must be a string for "
                "serialization to netCDF files".format(name)
            )

        if not isinstance(value, (str, Number, np.ndarray, np.number, list, tuple)):
            raise TypeError(
                "Invalid value for attr: {} must be a number, "
                "a string, an ndarray or a list/tuple of "
                "numbers/strings for serialization to netCDF "
                "files".format(value)
            )

    # Check attrs on the dataset itself
    for k, v in dataset.attrs.items():
        check_attr(k, v)

    # Check attrs on each variable within the dataset
    for variable in dataset.variables.values():
        for k, v in variable.attrs.items():
            check_attr(k, v)


 def _protect_dataset_variables_inplace(dataset, cache):
    for name, variable in dataset.variables.items():
        if name not in variable.dims:
            # no need to protect IndexVariable objects
            data = indexing.CopyOnWriteArray(variable._data)
            if cache:
                data = indexing.MemoryCachedArray(data)
            variable.data = data


 def _finalize_store(write, store):
    """ Finalize this store by explicitly syncing and closing"""
    del write  # ensure writing is done first
    store.close()

    
 def maybe_decode_store(
    store,
    mask_and_scale,
    decode_times,
    concat_characters,
    decode_coords,
    drop_variables,
    use_cftime,
    decode_timedelta,
    cache,
    chunks,
    filename_or_obj,
    group,
    decode_cf,
    engine,
    lock=False,
 ):
    """Decode ``store`` into a dataset and optionally chunk it with dask.

    The store is decoded with ``conventions.decode_cf`` and its variables
    are protected via ``_protect_dataset_variables_inplace``. When
    ``chunks`` is not None the dataset is chunked under a name derived from
    a dask token of the opening arguments, and the ``_file_obj`` of the
    unchunked dataset is copied onto the chunked one.

    Parameters
    ----------
    store : data store
        Passed straight to ``conventions.decode_cf``.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
    drop_variables, use_cftime, decode_timedelta
        Forwarded to ``conventions.decode_cf``; also part of the dask token.
    cache : bool
        Forwarded to ``_protect_dataset_variables_inplace``.
    chunks : int, dict or None
        Chunk specification; None skips chunking entirely.
    filename_or_obj, group, decode_cf, engine
        Only used to build the dask token.
    lock : optional
        Accepted but not referenced in this function.

    Returns
    -------
    The decoded dataset, chunked when ``chunks`` is not None.
    """
    ds = conventions.decode_cf(
        store,
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )

    _protect_dataset_variables_inplace(ds, cache)

    if chunks is None:
        return ds

    from dask.base import tokenize

    # For an actual local file path, fold the file modification time into
    # the token.
    is_local_path = isinstance(filename_or_obj, str) and not is_remote_uri(
        filename_or_obj
    )
    modified = os.path.getmtime(filename_or_obj) if is_local_path else None

    token = tokenize(
        filename_or_obj,
        modified,
        group,
        decode_cf,
        mask_and_scale,
        decode_times,
        concat_characters,
        decode_coords,
        engine,
        chunks,
        drop_variables,
        use_cftime,
        decode_timedelta,
    )
    chunked = ds.chunk(chunks, name_prefix="open_dataset-%s" % token, token=token)
    # Carry the file object reference over to the chunked dataset.
    chunked._file_obj = ds._file_obj
    return chunked

            
 def open_dataset(
    filename_or_obj,
    group=None,
    decode_cf=True,
    mask_and_scale=None,
    decode_times=True,
    autoclose=None,
    concat_characters=True,
    decode_coords=True,
    engine=None,
    chunks=None,
    lock=None,
    cache=None,
    drop_variables=None,
    backend_kwargs=None,
    use_cftime=None,
    decode_timedelta=None,
 ):
    """Open a file, file-like object or data store and decode it as a Dataset.

    Parameters
    ----------
    filename_or_obj : str, Path, file or xarray.backends.*DataStore
        A string or Path is treated as the path to a netCDF file or as an
        OpenDAP URL and is opened with python-netCDF4, except that a
        filename ending in .gz is gunzipped and read with scipy.io.netcdf
        (netCDF3 only). Byte-strings and file-like objects are read with
        scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
    group : str, optional
        Path of the netCDF4 group to open within the file (netCDF4 files
        only).
    decode_cf : bool, optional
        Whether to decode the variables under the assumption that they were
        saved following CF conventions. When False, ``mask_and_scale``,
        ``decode_times``, ``concat_characters``, ``decode_coords`` and
        ``decode_timedelta`` are all forced to False.
    mask_and_scale : bool, optional
        If True, array values equal to `_FillValue` are replaced with NA and
        values are scaled as `original_values * scale_factor + add_offset`,
        with `_FillValue`, `scale_factor` and `add_offset` read from the
        variable attributes when present. If `_FillValue` or
        `missing_value` holds several values, a warning is issued and every
        array value matching any of them is replaced by NA. Defaults to
        True, except with the pseudonetcdf backend.
    decode_times : bool, optional
        If True, times stored in the standard NetCDF datetime format are
        decoded into datetime objects; otherwise they stay encoded as
        numbers.
    autoclose : bool, optional
        No longer used. Passing any value other than None emits a
        FutureWarning and the value is ignored.
    concat_characters : bool, optional
        If True, character arrays are concatenated along their last
        dimension to form string arrays. A dimension is only concatenated
        over (and removed) if it has no corresponding variable and is used
        solely as the last dimension of character arrays.
    decode_coords : bool, optional
        If True, the 'coordinates' attribute is decoded to identify
        coordinates in the resulting dataset.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib',
        'pseudonetcdf'}, optional
        Engine used to read the file. When omitted, a default is chosen
        from the available dependencies, preferring 'netcdf4'.
    chunks : int or dict, optional
        If given, it is used to load the new dataset into dask arrays.
        ``chunks={}`` loads the dataset with dask using a single chunk for
        all arrays.
    lock : False or duck threading.Lock, optional
        Resource lock used when reading data from disk. Only relevant with
        dask or another form of parallelism. By default, appropriate locks
        are chosen to safely read and write files with the currently active
        dask scheduler.
    cache : bool, optional
        If True, data loaded from the underlying datastore is cached in
        memory as NumPy arrays on access, so the datastore is not read
        repeatedly. Defaults to True unless ``chunks`` is given, in which
        case it defaults to False. Coordinates corresponding to dimensions
        are unaffected; they always load their data from disk into a
        ``pandas.Index``.
    drop_variables : string or iterable, optional
        A variable or list of variables to exclude from parsing. Useful for
        dropping variables with problems or inconsistent values.
    backend_kwargs : dictionary, optional
        Keyword arguments passed on to the backend. Useful when backend
        options would improve performance or give the user control over
        dataset processing.
    use_cftime : bool, optional
        Only relevant if encoded dates come from a standard calendar
        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
        specified). If None (default), times are decoded to
        ``np.datetime64[ns]`` objects where possible and to
        ``cftime.datetime`` objects otherwise. If True, times are always
        decoded to ``cftime.datetime`` objects, whether or not they can be
        represented as ``np.datetime64[ns]``. If False, times are always
        decoded to ``np.datetime64[ns]`` objects, and an error is raised
        when that is not possible.
    decode_timedelta : bool, optional
        If True, variables and coordinates with time units in
        {'days', 'hours', 'minutes', 'seconds', 'milliseconds',
        'microseconds'} are decoded into timedelta objects. If False, they
        stay encoded as numbers. If None (default), the value of
        ``decode_times`` is assumed.

    Returns
    -------
    dataset : Dataset
        The newly created dataset.

    Raises
    ------
    ValueError
        If ``engine`` is not recognized, or if bytes or a file-like object
        is given together with an engine other than 'scipy' or 'h5netcdf'.

    Notes
    -----
    ``open_dataset`` opens the file with read-only access. Modifying the
    values of a Dataset, even one linked to files on disk, only changes the
    in-memory copy handled by xarray: the original file on disk is never
    touched.

    See Also
    --------
    open_mfdataset
    """
    supported_engines = [
        None,
        "netcdf4",
        "scipy",
        "pydap",
        "h5netcdf",
        "pynio",
        "cfgrib",
        "pseudonetcdf",
    ]
    if engine not in supported_engines:
        raise ValueError(
            "unrecognized engine for open_dataset: {}\n"
            "must be one of: {}".format(engine, supported_engines)
        )

    if autoclose is not None:
        warnings.warn(
            "The autoclose argument is no longer used by "
            "xarray.open_dataset() and is now ignored; it will be removed in "
            "a future version of xarray. If necessary, you can control the "
            "maximum number of simultaneous open files with "
            "xarray.set_options(file_cache_maxsize=...).",
            FutureWarning,
            stacklevel=2,
        )

    if mask_and_scale is None:
        mask_and_scale = engine != "pseudonetcdf"

    if not decode_cf:
        # Disabling CF decoding switches off every individual decoding step.
        mask_and_scale = decode_times = concat_characters = False
        decode_coords = decode_timedelta = False

    if cache is None:
        cache = chunks is None

    if backend_kwargs is None:
        backend_kwargs = {}

    if isinstance(filename_or_obj, Path):
        filename_or_obj = str(filename_or_obj)

    if isinstance(filename_or_obj, AbstractDataStore):
        # Already an open store: use it as is.
        store = filename_or_obj

    elif isinstance(filename_or_obj, str):
        filename_or_obj = _normalize_path(filename_or_obj)
        if engine is None:
            engine = _get_default_engine(filename_or_obj, allow_remote=True)

        if engine == "netcdf4":
            store = backends.NetCDF4DataStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            )
        elif engine == "scipy":
            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
        elif engine == "pydap":
            store = backends.PydapDataStore.open(filename_or_obj, **backend_kwargs)
        elif engine == "h5netcdf":
            store = backends.H5NetCDFStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            )
        elif engine == "pynio":
            store = backends.NioDataStore(filename_or_obj, lock=lock, **backend_kwargs)
        elif engine == "pseudonetcdf":
            store = backends.PseudoNetCDFDataStore.open(
                filename_or_obj, lock=lock, **backend_kwargs
            )
        elif engine == "cfgrib":
            store = backends.CfGribDataStore(
                filename_or_obj, lock=lock, **backend_kwargs
            )

    else:
        # Bytes or a file-like object: the engine is determined from the
        # magic number, whatever allowed value the caller passed.
        if engine not in (None, "scipy", "h5netcdf"):
            raise ValueError(
                "can only read bytes or file-like objects "
                "with engine='scipy' or 'h5netcdf'"
            )
        engine = _get_engine_from_magic_number(filename_or_obj)
        if engine == "scipy":
            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
        elif engine == "h5netcdf":
            store = backends.H5NetCDFStore.open(
                filename_or_obj, group=group, lock=lock, **backend_kwargs
            )

    with close_on_error(store):
        ds = maybe_decode_store(
            store,
            mask_and_scale=mask_and_scale,
            decode_times=decode_times,
            concat_characters=concat_characters,
            decode_coords=decode_coords,
            drop_variables=drop_variables,
            use_cftime=use_cftime,
            decode_timedelta=decode_timedelta,
            cache=cache,
            chunks=chunks,
            filename_or_obj=filename_or_obj,
            group=group,
            decode_cf=decode_cf,
            engine=engine,
        )

    # Ensure source filename always stored in dataset object (GH issue #2550)
    if "source" not in ds.encoding and isinstance(filename_or_obj, str):
        ds.encoding["source"] = filename_or_obj

    return ds
diff --git a/nc-xarray-profiling.ipynb b/nc-xarray-profiling.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xarray as xr\n",
    "import netCDF4 as nc\n",
    "from netCDF4 import Dataset\n",
    "from api import open_dataset, maybe_decode_store\n",
    "xr.open_dataset = open_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = \"~/.xarray_tutorial_data/air_temperature.nc\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><svg style=\"position: absolute; width: 0; height: 0; overflow: hidden\">\n",
       "<defs>\n",
       "<symbol id=\"icon-database\" viewBox=\"0 0 32 32\">\n",
       "<path d=\"M16 0c-8.837 0-16 2.239-16 5v4c0 2.761 7.163 5 16 5s16-2.239 16-5v-4c0-2.761-7.163-5-16-5z\"></path>\n",
       "<path d=\"M16 17c-8.837 0-16-2.239-16-5v6c0 2.761 7.163 5 16 5s16-2.239 16-5v-6c0 2.761-7.163 5-16 5z\"></path>\n",
       "<path d=\"M16 26c-8.837 0-16-2.239-16-5v6c0 2.761 7.163 5 16 5s16-2.239 16-5v-6c0 2.761-7.163 5-16 5z\"></path>\n",
       "</symbol>\n",
       "<symbol id=\"icon-file-text2\" viewBox=\"0 0 32 32\">\n",
       "<path d=\"M28.681 7.159c-0.694-0.947-1.662-2.053-2.724-3.116s-2.169-2.030-3.116-2.724c-1.612-1.182-2.393-1.319-2.841-1.319h-15.5c-1.378 0-2.5 1.121-2.5 2.5v27c0 1.378 1.122 2.5 2.5 2.5h23c1.378 0 2.5-1.122 2.5-2.5v-19.5c0-0.448-0.137-1.23-1.319-2.841zM24.543 5.457c0.959 0.959 1.712 1.825 2.268 2.543h-4.811v-4.811c0.718 0.556 1.584 1.309 2.543 2.268zM28 29.5c0 0.271-0.229 0.5-0.5 0.5h-23c-0.271 0-0.5-0.229-0.5-0.5v-27c0-0.271 0.229-0.5 0.5-0.5 0 0 15.499-0 15.5 0v7c0 0.552 0.448 1 1 1h7v19.5z\"></path>\n",
       "<path d=\"M23 26h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
       "<path d=\"M23 22h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
       "<path d=\"M23 18h-14c-0.552 0-1-0.448-1-1s0.448-1 1-1h14c0.552 0 1 0.448 1 1s-0.448 1-1 1z\"></path>\n",
       "</symbol>\n",
       "</defs>\n",
       "</svg>\n",
       "<style>/* CSS stylesheet for displaying xarray objects in jupyterlab.\n",
       " *\n",
       " */\n",
       "\n",
       ":root {\n",
       "  --xr-font-color0: var(--jp-content-font-color0, rgba(0, 0, 0, 1));\n",
       "  --xr-font-color2: var(--jp-content-font-color2, rgba(0, 0, 0, 0.54));\n",
       "  --xr-font-color3: var(--jp-content-font-color3, rgba(0, 0, 0, 0.38));\n",
       "  --xr-border-color: var(--jp-border-color2, #e0e0e0);\n",
       "  --xr-disabled-color: var(--jp-layout-color3, #bdbdbd);\n",
       "  --xr-background-color: var(--jp-layout-color0, white);\n",
       "  --xr-background-color-row-even: var(--jp-layout-color1, white);\n",
       "  --xr-background-color-row-odd: var(--jp-layout-color2, #eeeeee);\n",
       "}\n",
       "\n",
       "html[theme=dark],\n",
       "body.vscode-dark {\n",
       "  --xr-font-color0: rgba(255, 255, 255, 1);\n",
       "  --xr-font-color2: rgba(255, 255, 255, 0.54);\n",
       "  --xr-font-color3: rgba(255, 255, 255, 0.38);\n",
       "  --xr-border-color: #1F1F1F;\n",
       "  --xr-disabled-color: #515151;\n",
       "  --xr-background-color: #111111;\n",
       "  --xr-background-color-row-even: #111111;\n",
       "  --xr-background-color-row-odd: #313131;\n",
       "}\n",
       "\n",
       ".xr-wrap {\n",
       "  display: block;\n",
       "  min-width: 300px;\n",
       "  max-width: 700px;\n",
       "}\n",
       "\n",
       ".xr-text-repr-fallback {\n",
       "  /* fallback to plain text repr when CSS is not injected (untrusted notebook) */\n",
       "  display: none;\n",
       "}\n",
       "\n",
       ".xr-header {\n",
       "  padding-top: 6px;\n",
       "  padding-bottom: 6px;\n",
       "  margin-bottom: 4px;\n",
       "  border-bottom: solid 1px var(--xr-border-color);\n",
       "}\n",
       "\n",
       ".xr-header > div,\n",
       ".xr-header > ul {\n",
       "  display: inline;\n",
       "  margin-top: 0;\n",
       "  margin-bottom: 0;\n",
       "}\n",
       "\n",
       ".xr-obj-type,\n",
       ".xr-array-name {\n",
       "  margin-left: 2px;\n",
       "  margin-right: 10px;\n",
       "}\n",
       "\n",
       ".xr-obj-type {\n",
       "  color: var(--xr-font-color2);\n",
       "}\n",
       "\n",
       ".xr-sections {\n",
       "  padding-left: 0 !important;\n",
       "  display: grid;\n",
       "  grid-template-columns: 150px auto auto 1fr 20px 20px;\n",
       "}\n",
       "\n",
       ".xr-section-item {\n",
       "  display: contents;\n",
       "}\n",
       "\n",
       ".xr-section-item input {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       ".xr-section-item input + label {\n",
       "  color: var(--xr-disabled-color);\n",
       "}\n",
       "\n",
       ".xr-section-item input:enabled + label {\n",
       "  cursor: pointer;\n",
       "  color: var(--xr-font-color2);\n",
       "}\n",
       "\n",
       ".xr-section-item input:enabled + label:hover {\n",
       "  color: var(--xr-font-color0);\n",
       "}\n",
       "\n",
       ".xr-section-summary {\n",
       "  grid-column: 1;\n",
       "  color: var(--xr-font-color2);\n",
       "  font-weight: 500;\n",
       "}\n",
       "\n",
       ".xr-section-summary > span {\n",
       "  display: inline-block;\n",
       "  padding-left: 0.5em;\n",
       "}\n",
       "\n",
       ".xr-section-summary-in:disabled + label {\n",
       "  color: var(--xr-font-color2);\n",
       "}\n",
       "\n",
       ".xr-section-summary-in + label:before {\n",
       "  display: inline-block;\n",
       "  content: '►';\n",
       "  font-size: 11px;\n",
       "  width: 15px;\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       ".xr-section-summary-in:disabled + label:before {\n",
       "  color: var(--xr-disabled-color);\n",
       "}\n",
       "\n",
       ".xr-section-summary-in:checked + label:before {\n",
       "  content: '▼';\n",
       "}\n",
       "\n",
       ".xr-section-summary-in:checked + label > span {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       ".xr-section-summary,\n",
       ".xr-section-inline-details {\n",
       "  padding-top: 4px;\n",
       "  padding-bottom: 4px;\n",
       "}\n",
       "\n",
       ".xr-section-inline-details {\n",
       "  grid-column: 2 / -1;\n",
       "}\n",
       "\n",
       ".xr-section-details {\n",
       "  display: none;\n",
       "  grid-column: 1 / -1;\n",
       "  margin-bottom: 5px;\n",
       "}\n",
       "\n",
       ".xr-section-summary-in:checked ~ .xr-section-details {\n",
       "  display: contents;\n",
       "}\n",
       "\n",
       ".xr-array-wrap {\n",
       "  grid-column: 1 / -1;\n",
       "  display: grid;\n",
       "  grid-template-columns: 20px auto;\n",
       "}\n",
       "\n",
       ".xr-array-wrap > label {\n",
       "  grid-column: 1;\n",
       "  vertical-align: top;\n",
       "}\n",
       "\n",
       ".xr-preview {\n",
       "  color: var(--xr-font-color3);\n",
       "}\n",
       "\n",
       ".xr-array-preview,\n",
       ".xr-array-data {\n",
       "  padding: 0 5px !important;\n",
       "  grid-column: 2;\n",
       "}\n",
       "\n",
       ".xr-array-data,\n",
       ".xr-array-in:checked ~ .xr-array-preview {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       ".xr-array-in:checked ~ .xr-array-data,\n",
       ".xr-array-preview {\n",
       "  display: inline-block;\n",
       "}\n",
       "\n",
       ".xr-dim-list {\n",
       "  display: inline-block !important;\n",
       "  list-style: none;\n",
       "  padding: 0 !important;\n",
       "  margin: 0;\n",
       "}\n",
       "\n",
       ".xr-dim-list li {\n",
       "  display: inline-block;\n",
       "  padding: 0;\n",
       "  margin: 0;\n",
       "}\n",
       "\n",
       ".xr-dim-list:before {\n",
       "  content: '(';\n",
       "}\n",
       "\n",
       ".xr-dim-list:after {\n",
       "  content: ')';\n",
       "}\n",
       "\n",
       ".xr-dim-list li:not(:last-child):after {\n",
       "  content: ',';\n",
       "  padding-right: 5px;\n",
       "}\n",
       "\n",
       ".xr-has-index {\n",
       "  font-weight: bold;\n",
       "}\n",
       "\n",
       ".xr-var-list,\n",
       ".xr-var-item {\n",
       "  display: contents;\n",
       "}\n",
       "\n",
       ".xr-var-item > div,\n",
       ".xr-var-item label,\n",
       ".xr-var-item > .xr-var-name span {\n",
       "  background-color: var(--xr-background-color-row-even);\n",
       "  margin-bottom: 0;\n",
       "}\n",
       "\n",
       ".xr-var-item > .xr-var-name:hover span {\n",
       "  padding-right: 5px;\n",
       "}\n",
       "\n",
       ".xr-var-list > li:nth-child(odd) > div,\n",
       ".xr-var-list > li:nth-child(odd) > label,\n",
       ".xr-var-list > li:nth-child(odd) > .xr-var-name span {\n",
       "  background-color: var(--xr-background-color-row-odd);\n",
       "}\n",
       "\n",
       ".xr-var-name {\n",
       "  grid-column: 1;\n",
       "}\n",
       "\n",
       ".xr-var-dims {\n",
       "  grid-column: 2;\n",
       "}\n",
       "\n",
       ".xr-var-dtype {\n",
       "  grid-column: 3;\n",
       "  text-align: right;\n",
       "  color: var(--xr-font-color2);\n",
       "}\n",
       "\n",
       ".xr-var-preview {\n",
       "  grid-column: 4;\n",
       "}\n",
       "\n",
       ".xr-var-name,\n",
       ".xr-var-dims,\n",
       ".xr-var-dtype,\n",
       ".xr-preview,\n",
       ".xr-attrs dt {\n",
       "  white-space: nowrap;\n",
       "  overflow: hidden;\n",
       "  text-overflow: ellipsis;\n",
       "  padding-right: 10px;\n",
       "}\n",
       "\n",
       ".xr-var-name:hover,\n",
       ".xr-var-dims:hover,\n",
       ".xr-var-dtype:hover,\n",
       ".xr-attrs dt:hover {\n",
       "  overflow: visible;\n",
       "  width: auto;\n",
       "  z-index: 1;\n",
       "}\n",
       "\n",
       ".xr-var-attrs,\n",
       ".xr-var-data {\n",
       "  display: none;\n",
       "  background-color: var(--xr-background-color) !important;\n",
       "  padding-bottom: 5px !important;\n",
       "}\n",
       "\n",
       ".xr-var-attrs-in:checked ~ .xr-var-attrs,\n",
       ".xr-var-data-in:checked ~ .xr-var-data {\n",
       "  display: block;\n",
       "}\n",
       "\n",
       ".xr-var-data > table {\n",
       "  float: right;\n",
       "}\n",
       "\n",
       ".xr-var-name span,\n",
       ".xr-var-data,\n",
       ".xr-attrs {\n",
       "  padding-left: 25px !important;\n",
       "}\n",
       "\n",
       ".xr-attrs,\n",
       ".xr-var-attrs,\n",
       ".xr-var-data {\n",
       "  grid-column: 1 / -1;\n",
       "}\n",
       "\n",
       "dl.xr-attrs {\n",
       "  padding: 0;\n",
       "  margin: 0;\n",
       "  display: grid;\n",
       "  grid-template-columns: 125px auto;\n",
       "}\n",
       "\n",
       ".xr-attrs dt, dd {\n",
       "  padding: 0;\n",
       "  margin: 0;\n",
       "  float: left;\n",
       "  padding-right: 10px;\n",
       "  width: auto;\n",
       "}\n",
       "\n",
       ".xr-attrs dt {\n",
       "  font-weight: normal;\n",
       "  grid-column: 1;\n",
       "}\n",
       "\n",
       ".xr-attrs dt:hover span {\n",
       "  display: inline-block;\n",
       "  background: var(--xr-background-color);\n",
       "  padding-right: 10px;\n",
       "}\n",
       "\n",
       ".xr-attrs dd {\n",
       "  grid-column: 2;\n",
       "  white-space: pre-wrap;\n",
       "  word-break: break-all;\n",
       "}\n",
       "\n",
       ".xr-icon-database,\n",
       ".xr-icon-file-text2 {\n",
       "  display: inline-block;\n",
       "  vertical-align: middle;\n",
       "  width: 1em;\n",
       "  height: 1.5em !important;\n",
       "  stroke-width: 0;\n",
       "  stroke: currentColor;\n",
       "  fill: currentColor;\n",
       "}\n",
       "</style><pre class='xr-text-repr-fallback'>&lt;xarray.Dataset&gt;\n",
       "Dimensions:  (lat: 25, lon: 53, time: 2920)\n",
       "Coordinates:\n",
       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
       "Data variables:\n",
       "    air      (time, lat, lon) float32 ...\n",
       "Attributes:\n",
       "    Conventions:  COARDS\n",
       "    title:        4x daily NMC reanalysis (1948)\n",
       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
       "    platform:     Model\n",
       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...</pre><div class='xr-wrap' hidden><div class='xr-header'><div class='xr-obj-type'>xarray.Dataset</div></div><ul class='xr-sections'><li class='xr-section-item'><input id='section-dfd13d8e-4433-4254-b735-e8ca0b13fbb3' class='xr-section-summary-in' type='checkbox' disabled ><label for='section-dfd13d8e-4433-4254-b735-e8ca0b13fbb3' class='xr-section-summary'  title='Expand/collapse section'>Dimensions:</label><div class='xr-section-inline-details'><ul class='xr-dim-list'><li><span class='xr-has-index'>lat</span>: 25</li><li><span class='xr-has-index'>lon</span>: 53</li><li><span class='xr-has-index'>time</span>: 2920</li></ul></div><div class='xr-section-details'></div></li><li class='xr-section-item'><input id='section-4e2d48c0-dc00-482f-9a95-58e8864d594b' class='xr-section-summary-in' type='checkbox'  checked><label for='section-4e2d48c0-dc00-482f-9a95-58e8864d594b' class='xr-section-summary' >Coordinates: <span>(3)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><ul class='xr-var-list'><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>lat</span></div><div class='xr-var-dims'>(lat)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>75.0 72.5 70.0 ... 
20.0 17.5 15.0</div><input id='attrs-f848eb78-8b71-4655-a9f4-76e069e29057' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-f848eb78-8b71-4655-a9f4-76e069e29057' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-61fcd39d-b03f-4612-bc39-ce48316632fb' class='xr-var-data-in' type='checkbox'><label for='data-61fcd39d-b03f-4612-bc39-ce48316632fb' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>latitude</dd><dt><span>long_name :</span></dt><dd>Latitude</dd><dt><span>units :</span></dt><dd>degrees_north</dd><dt><span>axis :</span></dt><dd>Y</dd></dl></div><div class='xr-var-data'><pre>array([75. , 72.5, 70. , 67.5, 65. , 62.5, 60. , 57.5, 55. , 52.5, 50. , 47.5,\n",
       "       45. , 42.5, 40. , 37.5, 35. , 32.5, 30. , 27.5, 25. , 22.5, 20. , 17.5,\n",
       "       15. ], dtype=float32)</pre></div></li><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>lon</span></div><div class='xr-var-dims'>(lon)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>200.0 202.5 205.0 ... 327.5 330.0</div><input id='attrs-1c42a6a7-1034-4359-b571-022ed7b45edf' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-1c42a6a7-1034-4359-b571-022ed7b45edf' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-732ac6d3-1d2b-49a5-8b9e-e0d6e47f4eb7' class='xr-var-data-in' type='checkbox'><label for='data-732ac6d3-1d2b-49a5-8b9e-e0d6e47f4eb7' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>longitude</dd><dt><span>long_name :</span></dt><dd>Longitude</dd><dt><span>units :</span></dt><dd>degrees_east</dd><dt><span>axis :</span></dt><dd>X</dd></dl></div><div class='xr-var-data'><pre>array([200. , 202.5, 205. , 207.5, 210. , 212.5, 215. , 217.5, 220. , 222.5,\n",
       "       225. , 227.5, 230. , 232.5, 235. , 237.5, 240. , 242.5, 245. , 247.5,\n",
       "       250. , 252.5, 255. , 257.5, 260. , 262.5, 265. , 267.5, 270. , 272.5,\n",
       "       275. , 277.5, 280. , 282.5, 285. , 287.5, 290. , 292.5, 295. , 297.5,\n",
       "       300. , 302.5, 305. , 307.5, 310. , 312.5, 315. , 317.5, 320. , 322.5,\n",
       "       325. , 327.5, 330. ], dtype=float32)</pre></div></li><li class='xr-var-item'><div class='xr-var-name'><span class='xr-has-index'>time</span></div><div class='xr-var-dims'>(time)</div><div class='xr-var-dtype'>datetime64[ns]</div><div class='xr-var-preview xr-preview'>2013-01-01 ... 2014-12-31T18:00:00</div><input id='attrs-e60f7082-e5ab-445d-a478-58d68a6ff6d7' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-e60f7082-e5ab-445d-a478-58d68a6ff6d7' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-29b258fe-0129-495d-8f96-ab8697eaa374' class='xr-var-data-in' type='checkbox'><label for='data-29b258fe-0129-495d-8f96-ab8697eaa374' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>standard_name :</span></dt><dd>time</dd><dt><span>long_name :</span></dt><dd>Time</dd></dl></div><div class='xr-var-data'><pre>array([&#x27;2013-01-01T00:00:00.000000000&#x27;, &#x27;2013-01-01T06:00:00.000000000&#x27;,\n",
       "       &#x27;2013-01-01T12:00:00.000000000&#x27;, ..., &#x27;2014-12-31T06:00:00.000000000&#x27;,\n",
       "       &#x27;2014-12-31T12:00:00.000000000&#x27;, &#x27;2014-12-31T18:00:00.000000000&#x27;],\n",
       "      dtype=&#x27;datetime64[ns]&#x27;)</pre></div></li></ul></div></li><li class='xr-section-item'><input id='section-52573bf5-8ddc-4d57-840f-d83b96af4671' class='xr-section-summary-in' type='checkbox'  checked><label for='section-52573bf5-8ddc-4d57-840f-d83b96af4671' class='xr-section-summary' >Data variables: <span>(1)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><ul class='xr-var-list'><li class='xr-var-item'><div class='xr-var-name'><span>air</span></div><div class='xr-var-dims'>(time, lat, lon)</div><div class='xr-var-dtype'>float32</div><div class='xr-var-preview xr-preview'>...</div><input id='attrs-d3542941-fa67-4ffe-8bf2-c16a9263eb37' class='xr-var-attrs-in' type='checkbox' ><label for='attrs-d3542941-fa67-4ffe-8bf2-c16a9263eb37' title='Show/Hide attributes'><svg class='icon xr-icon-file-text2'><use xlink:href='#icon-file-text2'></use></svg></label><input id='data-57c62f1a-f3eb-43d1-a69f-fca6666fedfc' class='xr-var-data-in' type='checkbox'><label for='data-57c62f1a-f3eb-43d1-a69f-fca6666fedfc' title='Show/Hide data repr'><svg class='icon xr-icon-database'><use xlink:href='#icon-database'></use></svg></label><div class='xr-var-attrs'><dl class='xr-attrs'><dt><span>long_name :</span></dt><dd>4xDaily Air temperature at sigma level 995</dd><dt><span>units :</span></dt><dd>degK</dd><dt><span>precision :</span></dt><dd>2</dd><dt><span>GRIB_id :</span></dt><dd>11</dd><dt><span>GRIB_name :</span></dt><dd>TMP</dd><dt><span>var_desc :</span></dt><dd>Air temperature</dd><dt><span>dataset :</span></dt><dd>NMC Reanalysis</dd><dt><span>level_desc :</span></dt><dd>Surface</dd><dt><span>statistic :</span></dt><dd>Individual Obs</dd><dt><span>parent_stat :</span></dt><dd>Other</dd><dt><span>actual_range :</span></dt><dd>[185.16 322.1 ]</dd></dl></div><div class='xr-var-data'><pre>[3869000 values with dtype=float32]</pre></div></li></ul></div></li><li class='xr-section-item'><input 
id='section-d69f0756-55ba-4e13-831d-a1e89de74ea7' class='xr-section-summary-in' type='checkbox'  checked><label for='section-d69f0756-55ba-4e13-831d-a1e89de74ea7' class='xr-section-summary' >Attributes: <span>(5)</span></label><div class='xr-section-inline-details'></div><div class='xr-section-details'><dl class='xr-attrs'><dt><span>Conventions :</span></dt><dd>COARDS</dd><dt><span>title :</span></dt><dd>4x daily NMC reanalysis (1948)</dd><dt><span>description :</span></dt><dd>Data is from NMC initialized reanalysis\n",
       "(4x/day).  These are the 0.9950 sigma level values.</dd><dt><span>platform :</span></dt><dd>Model</dd><dt><span>references :</span></dt><dd>http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.html</dd></dl></div></li></ul></div></div>"
      ],
      "text/plain": [
       "<xarray.Dataset>\n",
       "Dimensions:  (lat: 25, lon: 53, time: 2920)\n",
       "Coordinates:\n",
       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
       "Data variables:\n",
       "    air      (time, lat, lon) float32 ...\n",
       "Attributes:\n",
       "    Conventions:  COARDS\n",
       "    title:        4x daily NMC reanalysis (1948)\n",
       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
       "    platform:     Model\n",
       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xr.open_dataset(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ncfunc(f):\n",
    "    # open file\n",
    "    d = nc.Dataset(f, 'r')\n",
    "\n",
    "    #open time variable and pull values\n",
    "    # find what the time (unlimited) dimension is\n",
    "    if 'time' in d.variables.keys():\n",
    "        times = d['time']\n",
    "        start = str(times[0])\n",
    "        end = str(times[-1])\n",
    "        date = start + \"-\" + end\n",
    "\n",
    "    #go through the variables\n",
    "    var_list = []\n",
    "    # loop through all variables\n",
    "    for v in d.variables.keys():\n",
    "        # add all variables that are not coordinates to the catalog\n",
    "        if v not in list(dict(d.dimensions).keys()):\n",
    "            var_list.append(v)\n",
    "\n",
    "    #go through attributes\n",
    "    attr_list = {}\n",
    "    for v in var_list:\n",
    "        if hasattr(d.variables[v], 'units'):\n",
    "            attr_list[v] = getattr(d.variables[v], 'units')\n",
    "\n",
    "    #close file\n",
    "    # close netcdf file\n",
    "    d.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def xfunc(f):\n",
    "    # open file\n",
    "    d = xr.open_dataset(f, decode_times=True, use_cftime=True, chunks={})\n",
    "\n",
    "    # get time variable\n",
    "    # find what the time (unlimited) dimension is\n",
    "    if 'time' in d.coords:\n",
    "        times = d['time']\n",
    "        start = times[0].dt.strftime('%Y-%m-%d').data.item()\n",
    "        end = times[-1].dt.strftime('%Y-%m-%d').data.item()\n",
    "        date = start + \"-\" + end\n",
    "\n",
    "    # got through variable list\n",
    "    var_list = []    \n",
    "    # loop through all variables\n",
    "    for v in d.variables.keys():\n",
    "        # add all variables that are not coordinates to the catalog\n",
    "        if v not in d.coords:\n",
    "            var_list.append(v)\n",
    "\n",
    "    #go through attr list\n",
    "    attr_list = {}\n",
    "    for v in var_list:\n",
    "        if hasattr(d.variables[v], 'units'):\n",
    "            attr_list[v] = getattr(d.variables[v], 'units')\n",
    "\n",
    "    #close the file\n",
    "    # close netcdf file\n",
    "    d.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext line_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timer unit: 1e-06 s\n",
       "\n",
       "Total time: 0.047982 s\n",
       "File: <ipython-input-5-ddc292be8500>\n",
       "Function: xfunc at line 1\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
       "     1                                           def xfunc(f):\n",
       "     2                                               # open file\n",
       "     3         1      43118.0  43118.0     89.9      d = xr.open_dataset(f, decode_times=True, use_cftime=True, chunks={})\n",
       "     4                                           \n",
       "     5                                               # get time variable\n",
       "     6                                               # find what the time (unlimited) dimension is\n",
       "     7         1         19.0     19.0      0.0      if 'time' in d.coords:\n",
       "     8         1         83.0     83.0      0.2          times = d['time']\n",
       "     9         1       2536.0   2536.0      5.3          start = times[0].dt.strftime('%Y-%m-%d').data.item()\n",
       "    10         1       2094.0   2094.0      4.4          end = times[-1].dt.strftime('%Y-%m-%d').data.item()\n",
       "    11         1          3.0      3.0      0.0          date = start + \"-\" + end\n",
       "    12                                           \n",
       "    13                                               # got through variable list\n",
       "    14         1          0.0      0.0      0.0      var_list = []    \n",
       "    15                                               # loop through all variables\n",
       "    16         5         15.0      3.0      0.0      for v in d.variables.keys():\n",
       "    17                                                   # add all variables that are not coordinates to the catalog\n",
       "    18         4         20.0      5.0      0.0          if v not in d.coords:\n",
       "    19         1          1.0      1.0      0.0              var_list.append(v)\n",
       "    20                                           \n",
       "    21                                               #go through attr list\n",
       "    22         1          1.0      1.0      0.0      attr_list = {}\n",
       "    23         2          2.0      1.0      0.0      for v in var_list:\n",
       "    24         1          6.0      6.0      0.0          if hasattr(d.variables[v], 'units'):\n",
       "    25                                                       attr_list[v] = getattr(d.variables[v], 'units')\n",
       "    26                                           \n",
       "    27                                               #close the file\n",
       "    28                                               # close netcdf file\n",
       "    29         1         84.0     84.0      0.2      d.close()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%lprun -f xfunc xfunc(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timer unit: 1e-06 s\n",
       "\n",
       "Total time: 0.007543 s\n",
       "File: /glade/work/abanihi/devel/misc/blog-posts/api.py\n",
       "Function: open_dataset at line 252\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
       "   252                                           def open_dataset(\n",
       "   253                                               filename_or_obj,\n",
       "   254                                               group=None,\n",
       "   255                                               decode_cf=True,\n",
       "   256                                               mask_and_scale=None,\n",
       "   257                                               decode_times=True,\n",
       "   258                                               autoclose=None,\n",
       "   259                                               concat_characters=True,\n",
       "   260                                               decode_coords=True,\n",
       "   261                                               engine=None,\n",
       "   262                                               chunks=None,\n",
       "   263                                               lock=None,\n",
       "   264                                               cache=None,\n",
       "   265                                               drop_variables=None,\n",
       "   266                                               backend_kwargs=None,\n",
       "   267                                               use_cftime=None,\n",
       "   268                                               decode_timedelta=None,\n",
       "   269                                           ):\n",
       "   270                                               \"\"\"Open and decode a dataset from a file or file-like object.\n",
       "   271                                               Parameters\n",
       "   272                                               ----------\n",
       "   273                                               filename_or_obj : str, Path, file or xarray.backends.*DataStore\n",
       "   274                                                   Strings and Path objects are interpreted as a path to a netCDF file\n",
       "   275                                                   or an OpenDAP URL and opened with python-netCDF4, unless the filename\n",
       "   276                                                   ends with .gz, in which case the file is gunzipped and opened with\n",
       "   277                                                   scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like\n",
       "   278                                                   objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).\n",
       "   279                                               group : str, optional\n",
       "   280                                                   Path to the netCDF4 group in the given file to open (only works for\n",
       "   281                                                   netCDF4 files).\n",
       "   282                                               decode_cf : bool, optional\n",
       "   283                                                   Whether to decode these variables, assuming they were saved according\n",
       "   284                                                   to CF conventions.\n",
       "   285                                               mask_and_scale : bool, optional\n",
       "   286                                                   If True, replace array values equal to `_FillValue` with NA and scale\n",
       "   287                                                   values according to the formula `original_values * scale_factor +\n",
       "   288                                                   add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are\n",
       "   289                                                   taken from variable attributes (if they exist).  If the `_FillValue` or\n",
       "   290                                                   `missing_value` attribute contains multiple values a warning will be\n",
       "   291                                                   issued and all array values matching one of the multiple values will\n",
       "   292                                                   be replaced by NA. mask_and_scale defaults to True except for the\n",
       "   293                                                   pseudonetcdf backend.\n",
       "   294                                               decode_times : bool, optional\n",
       "   295                                                   If True, decode times encoded in the standard NetCDF datetime format\n",
       "   296                                                   into datetime objects. Otherwise, leave them encoded as numbers.\n",
       "   297                                               autoclose : bool, optional\n",
       "   298                                                   If True, automatically close files to avoid OS Error of too many files\n",
       "   299                                                   being open.  However, this option doesn't work with streams, e.g.,\n",
       "   300                                                   BytesIO.\n",
       "   301                                               concat_characters : bool, optional\n",
       "   302                                                   If True, concatenate along the last dimension of character arrays to\n",
       "   303                                                   form string arrays. Dimensions will only be concatenated over (and\n",
       "   304                                                   removed) if they have no corresponding variable and if they are only\n",
       "   305                                                   used as the last dimension of character arrays.\n",
       "   306                                               decode_coords : bool, optional\n",
       "   307                                                   If True, decode the 'coordinates' attribute to identify coordinates in\n",
       "   308                                                   the resulting dataset.\n",
       "   309                                               engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', \\\n",
       "   310                                                   'pseudonetcdf'}, optional\n",
       "   311                                                   Engine to use when reading files. If not provided, the default engine\n",
       "   312                                                   is chosen based on available dependencies, with a preference for\n",
       "   313                                                   'netcdf4'.\n",
       "   314                                               chunks : int or dict, optional\n",
       "   315                                                   If chunks is provided, it used to load the new dataset into dask\n",
       "   316                                                   arrays. ``chunks={}`` loads the dataset with dask using a single\n",
       "   317                                                   chunk for all arrays.\n",
       "   318                                               lock : False or duck threading.Lock, optional\n",
       "   319                                                   Resource lock to use when reading data from disk. Only relevant when\n",
       "   320                                                   using dask or another form of parallelism. By default, appropriate\n",
       "   321                                                   locks are chosen to safely read and write files with the currently\n",
       "   322                                                   active dask scheduler.\n",
       "   323                                               cache : bool, optional\n",
       "   324                                                   If True, cache data loaded from the underlying datastore in memory as\n",
       "   325                                                   NumPy arrays when accessed to avoid reading from the underlying data-\n",
       "   326                                                   store multiple times. Defaults to True unless you specify the `chunks`\n",
       "   327                                                   argument to use dask, in which case it defaults to False. Does not\n",
       "   328                                                   change the behavior of coordinates corresponding to dimensions, which\n",
       "   329                                                   always load their data from disk into a ``pandas.Index``.\n",
       "   330                                               drop_variables: string or iterable, optional\n",
       "   331                                                   A variable or list of variables to exclude from being parsed from the\n",
       "   332                                                   dataset. This may be useful to drop variables with problems or\n",
       "   333                                                   inconsistent values.\n",
       "   334                                               backend_kwargs: dictionary, optional\n",
       "   335                                                   A dictionary of keyword arguments to pass on to the backend. This\n",
       "   336                                                   may be useful when backend options would improve performance or\n",
       "   337                                                   allow user control of dataset processing.\n",
       "   338                                               use_cftime: bool, optional\n",
       "   339                                                   Only relevant if encoded dates come from a standard calendar\n",
       "   340                                                   (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
       "   341                                                   specified).  If None (default), attempt to decode times to\n",
       "   342                                                   ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
       "   343                                                   ``cftime.datetime`` objects. If True, always decode times to\n",
       "   344                                                   ``cftime.datetime`` objects, regardless of whether or not they can be\n",
       "   345                                                   represented using ``np.datetime64[ns]`` objects.  If False, always\n",
       "   346                                                   decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
       "   347                                                   raise an error.\n",
       "   348                                               decode_timedelta : bool, optional\n",
       "   349                                                   If True, decode variables and coordinates with time units in\n",
       "   350                                                   {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}\n",
       "   351                                                   into timedelta objects. If False, leave them encoded as numbers.\n",
       "   352                                                   If None (default), assume the same value of decode_time.\n",
       "   353                                               Returns\n",
       "   354                                               -------\n",
       "   355                                               dataset : Dataset\n",
       "   356                                                   The newly created dataset.\n",
       "   357                                               Notes\n",
       "   358                                               -----\n",
       "   359                                               ``open_dataset`` opens the file with read-only access. When you modify\n",
       "   360                                               values of a Dataset, even one linked to files on disk, only the in-memory\n",
       "   361                                               copy you are manipulating in xarray is modified: the original file on disk\n",
       "   362                                               is never touched.\n",
       "   363                                               See Also\n",
       "   364                                               --------\n",
       "   365                                               open_mfdataset\n",
       "   366                                               \"\"\"\n",
       "   367                                               engines = [\n",
       "   368         1          2.0      2.0      0.0          None,\n",
       "   369         1          1.0      1.0      0.0          \"netcdf4\",\n",
       "   370         1          1.0      1.0      0.0          \"scipy\",\n",
       "   371         1          1.0      1.0      0.0          \"pydap\",\n",
       "   372         1          1.0      1.0      0.0          \"h5netcdf\",\n",
       "   373         1          1.0      1.0      0.0          \"pynio\",\n",
       "   374         1          1.0      1.0      0.0          \"cfgrib\",\n",
       "   375         1          0.0      0.0      0.0          \"pseudonetcdf\",\n",
       "   376                                               ]\n",
       "   377         1          1.0      1.0      0.0      if engine not in engines:\n",
       "   378                                                   raise ValueError(\n",
       "   379                                                       \"unrecognized engine for open_dataset: {}\\n\"\n",
       "   380                                                       \"must be one of: {}\".format(engine, engines)\n",
       "   381                                                   )\n",
       "   382                                           \n",
       "   383         1          1.0      1.0      0.0      if autoclose is not None:\n",
       "   384                                                   warnings.warn(\n",
       "   385                                                       \"The autoclose argument is no longer used by \"\n",
       "   386                                                       \"xarray.open_dataset() and is now ignored; it will be removed in \"\n",
       "   387                                                       \"a future version of xarray. If necessary, you can control the \"\n",
       "   388                                                       \"maximum number of simultaneous open files with \"\n",
       "   389                                                       \"xarray.set_options(file_cache_maxsize=...).\",\n",
       "   390                                                       FutureWarning,\n",
       "   391                                                       stacklevel=2,\n",
       "   392                                                   )\n",
       "   393                                           \n",
       "   394         1          0.0      0.0      0.0      if mask_and_scale is None:\n",
       "   395         1          0.0      0.0      0.0          mask_and_scale = not engine == \"pseudonetcdf\"\n",
       "   396                                           \n",
       "   397         1          1.0      1.0      0.0      if not decode_cf:\n",
       "   398                                                   mask_and_scale = False\n",
       "   399                                                   decode_times = False\n",
       "   400                                                   concat_characters = False\n",
       "   401                                                   decode_coords = False\n",
       "   402                                                   decode_timedelta = False\n",
       "   403                                           \n",
       "   404         1          0.0      0.0      0.0      if cache is None:\n",
       "   405         1          1.0      1.0      0.0          cache = chunks is None\n",
       "   406                                           \n",
       "   407         1          1.0      1.0      0.0      if backend_kwargs is None:\n",
       "   408         1          0.0      0.0      0.0          backend_kwargs = {}\n",
       "   409                                                   \n",
       "   410         1          2.0      2.0      0.0      if isinstance(filename_or_obj, Path):\n",
       "   411                                                   filename_or_obj = str(filename_or_obj)\n",
       "   412                                           \n",
       "   413         1          4.0      4.0      0.1      if isinstance(filename_or_obj, AbstractDataStore):\n",
       "   414                                                   store = filename_or_obj\n",
       "   415                                           \n",
       "   416         1          1.0      1.0      0.0      elif isinstance(filename_or_obj, str):\n",
       "   417         1         55.0     55.0      0.7          filename_or_obj = _normalize_path(filename_or_obj)\n",
       "   418                                           \n",
       "   419         1          1.0      1.0      0.0          if engine is None:\n",
       "   420                                                       engine = _get_default_engine(filename_or_obj, allow_remote=True)\n",
       "   421         1          1.0      1.0      0.0          if engine == \"netcdf4\":\n",
       "   422         1          2.0      2.0      0.0              store = backends.NetCDF4DataStore.open(\n",
       "   423         1        745.0    745.0      9.9                  filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
       "   424                                                       )\n",
       "   425                                                       \n",
       "   426                                                   elif engine == \"scipy\":\n",
       "   427                                                       store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)\n",
       "   428                                                   elif engine == \"pydap\":\n",
       "   429                                                       store = backends.PydapDataStore.open(filename_or_obj, **backend_kwargs)\n",
       "   430                                                   elif engine == \"h5netcdf\":\n",
       "   431                                                       store = backends.H5NetCDFStore.open(\n",
       "   432                                                           filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
       "   433                                                       )\n",
       "   434                                                   elif engine == \"pynio\":\n",
       "   435                                                       store = backends.NioDataStore(filename_or_obj, lock=lock, **backend_kwargs)\n",
       "   436                                                   elif engine == \"pseudonetcdf\":\n",
       "   437                                                       store = backends.PseudoNetCDFDataStore.open(\n",
       "   438                                                           filename_or_obj, lock=lock, **backend_kwargs\n",
       "   439                                                       )\n",
       "   440                                                   elif engine == \"cfgrib\":\n",
       "   441                                                       store = backends.CfGribDataStore(\n",
       "   442                                                           filename_or_obj, lock=lock, **backend_kwargs\n",
       "   443                                                       )\n",
       "   444                                           \n",
       "   445                                               else:\n",
       "   446                                                   if engine not in [None, \"scipy\", \"h5netcdf\"]:\n",
       "   447                                                       raise ValueError(\n",
       "   448                                                           \"can only read bytes or file-like objects \"\n",
       "   449                                                           \"with engine='scipy' or 'h5netcdf'\"\n",
       "   450                                                       )\n",
       "   451                                                   engine = _get_engine_from_magic_number(filename_or_obj)\n",
       "   452                                                   if engine == \"scipy\":\n",
       "   453                                                       store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)\n",
       "   454                                                   elif engine == \"h5netcdf\":\n",
       "   455                                                       store = backends.H5NetCDFStore.open(\n",
       "   456                                                           filename_or_obj, group=group, lock=lock, **backend_kwargs\n",
       "   457                                                       )\n",
       "   458                                                   \n",
       "   459         1         13.0     13.0      0.2      with close_on_error(store):\n",
       "   460         1       6700.0   6700.0     88.8          ds = maybe_decode_store(store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, cache, chunks, filename_or_obj, group, decode_cf, engine)\n",
       "   461                                           \n",
       "   462                                               # Ensure source filename always stored in dataset object (GH issue #2550)\n",
       "   463         1          3.0      3.0      0.0      if \"source\" not in ds.encoding:\n",
       "   464         1          1.0      1.0      0.0          if isinstance(filename_or_obj, str):\n",
       "   465         1          2.0      2.0      0.0              ds.encoding[\"source\"] = filename_or_obj\n",
       "   466                                           \n",
       "   467         1          0.0      0.0      0.0      return ds"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%lprun -s -f xr.open_dataset xr.open_dataset(f, engine='netcdf4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timer unit: 1e-06 s\n",
       "\n",
       "Total time: 0.006774 s\n",
       "File: /glade/work/abanihi/devel/misc/blog-posts/api.py\n",
       "Function: maybe_decode_store at line 205\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
       "   205                                           def maybe_decode_store(store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, cache, chunks, filename_or_obj, group, decode_cf, engine, lock=False):\n",
       "   206         1          3.0      3.0      0.0      ds = conventions.decode_cf(\n",
       "   207         1          1.0      1.0      0.0              store,\n",
       "   208         1          1.0      1.0      0.0              mask_and_scale=mask_and_scale,\n",
       "   209         1          1.0      1.0      0.0              decode_times=decode_times,\n",
       "   210         1          1.0      1.0      0.0              concat_characters=concat_characters,\n",
       "   211         1          1.0      1.0      0.0              decode_coords=decode_coords,\n",
       "   212         1          0.0      0.0      0.0              drop_variables=drop_variables,\n",
       "   213         1          0.0      0.0      0.0              use_cftime=use_cftime,\n",
       "   214         1       6703.0   6703.0     99.0              decode_timedelta=decode_timedelta,\n",
       "   215                                                   )\n",
       "   216                                           \n",
       "   217         1         60.0     60.0      0.9      _protect_dataset_variables_inplace(ds, cache)\n",
       "   218                                           \n",
       "   219         1          1.0      1.0      0.0      if chunks is not None:\n",
       "   220                                                   from dask.base import tokenize\n",
       "   221                                           \n",
       "   222                                                   # if passed an actual file path, augment the token with\n",
       "   223                                                   # the file modification time\n",
       "   224                                                   if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):\n",
       "   225                                                       mtime = os.path.getmtime(filename_or_obj)\n",
       "   226                                                   else:\n",
       "   227                                                       mtime = None\n",
       "   228                                                   token = tokenize(\n",
       "   229                                                       filename_or_obj,\n",
       "   230                                                       mtime,\n",
       "   231                                                       group,\n",
       "   232                                                       decode_cf,\n",
       "   233                                                       mask_and_scale,\n",
       "   234                                                       decode_times,\n",
       "   235                                                       concat_characters,\n",
       "   236                                                       decode_coords,\n",
       "   237                                                       engine,\n",
       "   238                                                       chunks,\n",
       "   239                                                       drop_variables,\n",
       "   240                                                       use_cftime,\n",
       "   241                                                       decode_timedelta,\n",
       "   242                                                   )\n",
       "   243                                                   name_prefix = \"open_dataset-%s\" % token\n",
       "   244                                                   ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)\n",
       "   245                                                   ds2._file_obj = ds._file_obj\n",
       "   246                                               else:\n",
       "   247         1          1.0      1.0      0.0          ds2 = ds\n",
       "   248                                           \n",
       "   249         1          1.0      1.0      0.0      return ds2"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%lprun -s -f maybe_decode_store xr.open_dataset(f, engine='netcdf4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xarray.conventions import decode_cf, decode_cf_variable\n",
    "import xarray.conventions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timer unit: 1e-06 s\n",
       "\n",
       "Total time: 0.00666 s\n",
       "File: /glade/work/abanihi/softwares/miniconda3/envs/playground/lib/python3.7/site-packages/xarray/conventions.py\n",
       "Function: decode_cf at line 517\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
       "   517                                           def decode_cf(\n",
       "   518                                               obj,\n",
       "   519                                               concat_characters=True,\n",
       "   520                                               mask_and_scale=True,\n",
       "   521                                               decode_times=True,\n",
       "   522                                               decode_coords=True,\n",
       "   523                                               drop_variables=None,\n",
       "   524                                               use_cftime=None,\n",
       "   525                                               decode_timedelta=None,\n",
       "   526                                           ):\n",
       "   527                                               \"\"\"Decode the given Dataset or Datastore according to CF conventions into\n",
       "   528                                               a new Dataset.\n",
       "   529                                           \n",
       "   530                                               Parameters\n",
       "   531                                               ----------\n",
       "   532                                               obj : Dataset or DataStore\n",
       "   533                                                   Object to decode.\n",
       "   534                                               concat_characters : bool, optional\n",
       "   535                                                   Should character arrays be concatenated to strings, for\n",
       "   536                                                   example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'\n",
       "   537                                               mask_and_scale: bool, optional\n",
       "   538                                                   Lazily scale (using scale_factor and add_offset) and mask\n",
       "   539                                                   (using _FillValue).\n",
       "   540                                               decode_times : bool, optional\n",
       "   541                                                   Decode cf times (e.g., integers since 'hours since 2000-01-01') to\n",
       "   542                                                   np.datetime64.\n",
       "   543                                               decode_coords : bool, optional\n",
       "   544                                                   Use the 'coordinates' attribute on variable (or the dataset itself) to\n",
       "   545                                                   identify coordinates.\n",
       "   546                                               drop_variables: string or iterable, optional\n",
       "   547                                                   A variable or list of variables to exclude from being parsed from the\n",
       "   548                                                   dataset. This may be useful to drop variables with problems or\n",
       "   549                                                   inconsistent values.\n",
       "   550                                               use_cftime: bool, optional\n",
       "   551                                                   Only relevant if encoded dates come from a standard calendar\n",
       "   552                                                   (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
       "   553                                                   specified).  If None (default), attempt to decode times to\n",
       "   554                                                   ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
       "   555                                                   ``cftime.datetime`` objects. If True, always decode times to\n",
       "   556                                                   ``cftime.datetime`` objects, regardless of whether or not they can be\n",
       "   557                                                   represented using ``np.datetime64[ns]`` objects.  If False, always\n",
       "   558                                                   decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
       "   559                                                   raise an error.\n",
       "   560                                               decode_timedelta : bool, optional\n",
       "   561                                                   If True, decode variables and coordinates with time units in\n",
       "   562                                                   {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}\n",
       "   563                                                   into timedelta objects. If False, leave them encoded as numbers.\n",
       "   564                                                   If None (default), assume the same value of decode_time.\n",
       "   565                                           \n",
       "   566                                               Returns\n",
       "   567                                               -------\n",
       "   568                                               decoded : Dataset\n",
       "   569                                               \"\"\"\n",
       "   570         1          8.0      8.0      0.1      from .core.dataset import Dataset\n",
       "   571         1          5.0      5.0      0.1      from .backends.common import AbstractDataStore\n",
       "   572                                           \n",
       "   573         1          4.0      4.0      0.1      if isinstance(obj, Dataset):\n",
       "   574                                                   vars = obj._variables\n",
       "   575                                                   attrs = obj.attrs\n",
       "   576                                                   extra_coords = set(obj.coords)\n",
       "   577                                                   file_obj = obj._file_obj\n",
       "   578                                                   encoding = obj.encoding\n",
       "   579         1          2.0      2.0      0.0      elif isinstance(obj, AbstractDataStore):\n",
       "   580         1        855.0    855.0     12.8          vars, attrs = obj.load()\n",
       "   581         1          2.0      2.0      0.0          extra_coords = set()\n",
       "   582         1          0.0      0.0      0.0          file_obj = obj\n",
       "   583         1         37.0     37.0      0.6          encoding = obj.get_encoding()\n",
       "   584                                               else:\n",
       "   585                                                   raise TypeError(\"can only decode Dataset or DataStore objects\")\n",
       "   586                                           \n",
       "   587         1          0.0      0.0      0.0      vars, attrs, coord_names = decode_cf_variables(\n",
       "   588         1          1.0      1.0      0.0          vars,\n",
       "   589         1          0.0      0.0      0.0          attrs,\n",
       "   590         1          0.0      0.0      0.0          concat_characters,\n",
       "   591         1          1.0      1.0      0.0          mask_and_scale,\n",
       "   592         1          1.0      1.0      0.0          decode_times,\n",
       "   593         1          0.0      0.0      0.0          decode_coords,\n",
       "   594         1          1.0      1.0      0.0          drop_variables=drop_variables,\n",
       "   595         1          1.0      1.0      0.0          use_cftime=use_cftime,\n",
       "   596         1       2698.0   2698.0     40.5          decode_timedelta=decode_timedelta,\n",
       "   597                                               )\n",
       "   598         1       2863.0   2863.0     43.0      ds = Dataset(vars, attrs=attrs)\n",
       "   599         1        175.0    175.0      2.6      ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))\n",
       "   600         1          2.0      2.0      0.0      ds._file_obj = file_obj\n",
       "   601         1          4.0      4.0      0.1      ds.encoding = encoding\n",
       "   602                                           \n",
       "   603         1          0.0      0.0      0.0      return ds"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%lprun -s -f decode_cf xr.open_dataset(f, engine='netcdf4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timer unit: 1e-06 s\n",
       "\n",
       "Total time: 0.002416 s\n",
       "File: /glade/work/abanihi/softwares/miniconda3/envs/playground/lib/python3.7/site-packages/xarray/conventions.py\n",
       "Function: decode_cf_variable at line 260\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
       "   260                                           def decode_cf_variable(\n",
       "   261                                               name,\n",
       "   262                                               var,\n",
       "   263                                               concat_characters=True,\n",
       "   264                                               mask_and_scale=True,\n",
       "   265                                               decode_times=True,\n",
       "   266                                               decode_endianness=True,\n",
       "   267                                               stack_char_dim=True,\n",
       "   268                                               use_cftime=None,\n",
       "   269                                               decode_timedelta=None,\n",
       "   270                                           ):\n",
       "   271                                               \"\"\"\n",
       "   272                                               Decodes a variable which may hold CF encoded information.\n",
       "   273                                           \n",
       "   274                                               This includes variables that have been masked and scaled, which\n",
       "   275                                               hold CF style time variables (this is almost always the case if\n",
       "   276                                               the dataset has been serialized) and which have strings encoded\n",
       "   277                                               as character arrays.\n",
       "   278                                           \n",
       "   279                                               Parameters\n",
       "   280                                               ----------\n",
       "   281                                               name: str\n",
       "   282                                                   Name of the variable. Used for better error messages.\n",
       "   283                                               var : Variable\n",
       "   284                                                   A variable holding potentially CF encoded information.\n",
       "   285                                               concat_characters : bool\n",
       "   286                                                   Should character arrays be concatenated to strings, for\n",
       "   287                                                   example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'\n",
       "   288                                               mask_and_scale: bool\n",
       "   289                                                   Lazily scale (using scale_factor and add_offset) and mask\n",
       "   290                                                   (using _FillValue). If the _Unsigned attribute is present\n",
       "   291                                                   treat integer arrays as unsigned.\n",
       "   292                                               decode_times : bool\n",
       "   293                                                   Decode cf times ('hours since 2000-01-01') to np.datetime64.\n",
       "   294                                               decode_endianness : bool\n",
       "   295                                                   Decode arrays from non-native to native endianness.\n",
       "   296                                               stack_char_dim : bool\n",
       "   297                                                   Whether to stack characters into bytes along the last dimension of this\n",
       "   298                                                   array. Passed as an argument because we need to look at the full\n",
       "   299                                                   dataset to figure out if this is appropriate.\n",
       "   300                                               use_cftime: bool, optional\n",
       "   301                                                   Only relevant if encoded dates come from a standard calendar\n",
       "   302                                                   (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not\n",
       "   303                                                   specified).  If None (default), attempt to decode times to\n",
       "   304                                                   ``np.datetime64[ns]`` objects; if this is not possible, decode times to\n",
       "   305                                                   ``cftime.datetime`` objects. If True, always decode times to\n",
       "   306                                                   ``cftime.datetime`` objects, regardless of whether or not they can be\n",
       "   307                                                   represented using ``np.datetime64[ns]`` objects.  If False, always\n",
       "   308                                                   decode times to ``np.datetime64[ns]`` objects; if this is not possible\n",
       "   309                                                   raise an error.\n",
       "   310                                           \n",
       "   311                                               Returns\n",
       "   312                                               -------\n",
       "   313                                               out : Variable\n",
       "   314                                                   A variable holding the decoded equivalent of var.\n",
       "   315                                               \"\"\"\n",
       "   316         4        137.0     34.2      5.7      var = as_variable(var)\n",
       "   317         4          6.0      1.5      0.2      original_dtype = var.dtype\n",
       "   318                                           \n",
       "   319         4          4.0      1.0      0.2      if decode_timedelta is None:\n",
       "   320         4          4.0      1.0      0.2          decode_timedelta = decode_times\n",
       "   321                                           \n",
       "   322         4          3.0      0.8      0.1      if concat_characters:\n",
       "   323         4          2.0      0.5      0.1          if stack_char_dim:\n",
       "   324                                                       var = strings.CharacterArrayCoder().decode(var, name=name)\n",
       "   325         4         77.0     19.2      3.2          var = strings.EncodedStringCoder().decode(var)\n",
       "   326                                           \n",
       "   327         4          1.0      0.2      0.0      if mask_and_scale:\n",
       "   328         4          1.0      0.2      0.0          for coder in [\n",
       "   329         4          4.0      1.0      0.2              variables.UnsignedIntegerCoder(),\n",
       "   330         4          4.0      1.0      0.2              variables.CFMaskCoder(),\n",
       "   331        16         12.0      0.8      0.5              variables.CFScaleOffsetCoder(),\n",
       "   332                                                   ]:\n",
       "   333        12        362.0     30.2     15.0              var = coder.decode(var, name=name)\n",
       "   334                                           \n",
       "   335         4          4.0      1.0      0.2      if decode_timedelta:\n",
       "   336         4         58.0     14.5      2.4          var = times.CFTimedeltaCoder().decode(var, name=name)\n",
       "   337         4          3.0      0.8      0.1      if decode_times:\n",
       "   338         4       1607.0    401.8     66.5          var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)\n",
       "   339                                           \n",
       "   340         4         13.0      3.2      0.5      dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)\n",
       "   341                                               # TODO(shoyer): convert everything below to use coders\n",
       "   342                                           \n",
       "   343         4          6.0      1.5      0.2      if decode_endianness and not data.dtype.isnative:\n",
       "   344                                                   # do this last, so it's only done if we didn't already unmask/scale\n",
       "   345                                                   data = NativeEndiannessArray(data)\n",
       "   346                                                   original_dtype = data.dtype\n",
       "   347                                           \n",
       "   348         4          4.0      1.0      0.2      encoding.setdefault(\"dtype\", original_dtype)\n",
       "   349                                           \n",
       "   350         4          2.0      0.5      0.1      if \"dtype\" in attributes and attributes[\"dtype\"] == \"bool\":\n",
       "   351                                                   del attributes[\"dtype\"]\n",
       "   352                                                   data = BoolTypeArray(data)\n",
       "   353                                           \n",
       "   354         4          3.0      0.8      0.1      if not isinstance(data, dask_array_type):\n",
       "   355         4         45.0     11.2      1.9          data = indexing.LazilyOuterIndexedArray(data)\n",
       "   356                                           \n",
       "   357         4         54.0     13.5      2.2      return Variable(dimensions, data, attributes, encoding=encoding)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%lprun -s -f xarray.conventions.decode_cf_variable xr.open_dataset(f, engine='netcdf4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	import os.path
	import warnings
	from glob import glob
	from io import BytesIO
	from numbers import Number
	from pathlib import Path
	from xarray.backends.common import AbstractDataStore, ArrayWriter
	from xarray.core.utils import close_on_error, is_grib_path, is_remote_uri
	from xarray.core import indexing
	from xarray import backends, conventions, coding


	def _get_default_engine_remote_uri():
	try:
	import netCDF4 # noqa: F401

	engine = "netcdf4"
	except ImportError: # pragma: no cover
	try:
	import pydap # noqa: F401

	engine = "pydap"
	except ImportError:
	raise ValueError(
	"netCDF4 or pydap is required for accessing "
	"remote datasets via OPeNDAP"
	)
	return engine


	def _get_default_engine_grib():
	msgs = []
	try:
	import Nio # noqa: F401

	msgs += ["set engine='pynio' to access GRIB files with PyNIO"]
	except ImportError: # pragma: no cover
	pass
	try:
	import cfgrib # noqa: F401

	msgs += ["set engine='cfgrib' to access GRIB files with cfgrib"]
	except ImportError: # pragma: no cover
	pass
	if msgs:
	raise ValueError(" or\n".join(msgs))
	else:
	raise ValueError("PyNIO or cfgrib is required for accessing " "GRIB files")


	def _get_default_engine_gz():
	try:
	import scipy # noqa: F401

	engine = "scipy"
	except ImportError: # pragma: no cover
	raise ValueError("scipy is required for accessing .gz files")
	return engine


	def _get_default_engine_netcdf():
	try:
	import netCDF4 # noqa: F401

	engine = "netcdf4"
	except ImportError: # pragma: no cover
	try:
	import scipy.io.netcdf # noqa: F401

	engine = "scipy"
	except ImportError:
	raise ValueError(
	"cannot read or write netCDF files without "
	"netCDF4-python or scipy installed"
	)
	return engine


	def _get_engine_from_magic_number(filename_or_obj):
	# check byte header to determine file type
	if isinstance(filename_or_obj, bytes):
	magic_number = filename_or_obj[:8]
	else:
	if filename_or_obj.tell() != 0:
	raise ValueError(
	"file-like object read/write pointer not at zero "
	"please close and reopen, or use a context "
	"manager"
	)
	magic_number = filename_or_obj.read(8)
	filename_or_obj.seek(0)

	if magic_number.startswith(b"CDF"):
	engine = "scipy"
	elif magic_number.startswith(b"\211HDF\r\n\032\n"):
	engine = "h5netcdf"
	if isinstance(filename_or_obj, bytes):
	raise ValueError(
	"can't open netCDF4/HDF5 as bytes "
	"try passing a path or file-like object"
	)
	else:
	if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
	filename_or_obj = filename_or_obj[:80] + b"..."
	raise ValueError(
	"{} is not a valid netCDF file "
	"did you mean to pass a string for a path instead?".format(filename_or_obj)
	)
	return engine


	def _get_default_engine(path, allow_remote=False):
	    """Dispatch to the default-engine helper that matches ``path``.

	    Parameters
	    ----------
	    path : str
	        Path or URI of the file to open.
	    allow_remote : bool, optional
	        When true, remote URIs are routed to the remote-URI helper;
	        otherwise the remote check is skipped entirely.

	    Returns
	    -------
	    str
	        The engine name returned by the selected helper.
	    """
	    # The checks run in priority order: remote, GRIB, gzip, plain netCDF.
	    if allow_remote and is_remote_uri(path):
	        return _get_default_engine_remote_uri()
	    if is_grib_path(path):
	        return _get_default_engine_grib()
	    if path.endswith(".gz"):
	        return _get_default_engine_gz()
	    return _get_default_engine_netcdf()


	def _normalize_path(path):
	    """Return ``path`` unchanged if it is a remote URI, else make it absolute.

	    Local paths have ``~`` expanded before being converted to an absolute
	    path.
	    """
	    if is_remote_uri(path):
	        return path
	    expanded = os.path.expanduser(path)
	    return os.path.abspath(expanded)


	def _validate_dataset_names(dataset):
	"""DataArray.name and Dataset keys must be a string or None"""

	def check_name(name):
	if isinstance(name, str):
	if not name:
	raise ValueError(
	"Invalid name for DataArray or Dataset key: "
	"string must be length 1 or greater for "
	"serialization to netCDF files"
	)
	elif name is not None:
	raise TypeError(
	"DataArray.name or Dataset key must be either a "
	"string or None for serialization to netCDF files"
	)

	for k in dataset.variables:
	check_name(k)


	def _validate_attrs(dataset):
	"""`attrs` must have a string key and a value which is either: a number,
	a string, an ndarray or a list/tuple of numbers/strings.
	"""

	def check_attr(name, value):
	if isinstance(name, str):
	if not name:
	raise ValueError(
	"Invalid name for attr: string must be "
	"length 1 or greater for serialization to "
	"netCDF files"
	)
	else:
	raise TypeError(
	"Invalid name for attr: {} must be a string for "
	"serialization to netCDF files".format(name)
	)

	if not isinstance(value, (str, Number, np.ndarray, np.number, list, tuple)):
	raise TypeError(
	"Invalid value for attr: {} must be a number, "
	"a string, an ndarray or a list/tuple of "
	"numbers/strings for serialization to netCDF "
	"files".format(value)
	)

	# Check attrs on the dataset itself
	for k, v in dataset.attrs.items():
	check_attr(k, v)

	# Check attrs on each variable within the dataset
	for variable in dataset.variables.values():
	for k, v in variable.attrs.items():
	check_attr(k, v)


	def _protect_dataset_variables_inplace(dataset, cache):
	for name, variable in dataset.variables.items():
	if name not in variable.dims:
	# no need to protect IndexVariable objects
	data = indexing.CopyOnWriteArray(variable._data)
	if cache:
	data = indexing.MemoryCachedArray(data)
	variable.data = data


	def _finalize_store(write, store):
	""" Finalize this store by explicitly syncing and closing"""
	del write # ensure writing is done first
	store.close()


	def maybe_decode_store(
	    store,
	    mask_and_scale,
	    decode_times,
	    concat_characters,
	    decode_coords,
	    drop_variables,
	    use_cftime,
	    decode_timedelta,
	    cache,
	    chunks,
	    filename_or_obj,
	    group,
	    decode_cf,
	    engine,
	    lock=False,
	):
	    """Decode ``store`` into a dataset and optionally chunk it with dask.

	    The store is passed through ``conventions.decode_cf`` with the given
	    decoding options, and its variables are wrapped by
	    ``_protect_dataset_variables_inplace``. If ``chunks`` is not None the
	    dataset is chunked, using a dask token built from the opening
	    arguments and, for local file paths, the file modification time.

	    ``filename_or_obj``, ``group``, ``decode_cf`` and ``engine`` are used
	    only to build that token. ``lock`` is accepted but not used.

	    Returns
	    -------
	    The decoded dataset, or its chunked counterpart carrying the same
	    ``_file_obj``.
	    """
	    decoded = conventions.decode_cf(
	        store,
	        mask_and_scale=mask_and_scale,
	        decode_times=decode_times,
	        concat_characters=concat_characters,
	        decode_coords=decode_coords,
	        drop_variables=drop_variables,
	        use_cftime=use_cftime,
	        decode_timedelta=decode_timedelta,
	    )
	    _protect_dataset_variables_inplace(decoded, cache)

	    if chunks is None:
	        return decoded

	    from dask.base import tokenize

	    # For an actual local file path the token also includes the file
	    # modification time.
	    is_local_path = isinstance(filename_or_obj, str) and not is_remote_uri(
	        filename_or_obj
	    )
	    mtime = os.path.getmtime(filename_or_obj) if is_local_path else None

	    token = tokenize(
	        filename_or_obj,
	        mtime,
	        group,
	        decode_cf,
	        mask_and_scale,
	        decode_times,
	        concat_characters,
	        decode_coords,
	        engine,
	        chunks,
	        drop_variables,
	        use_cftime,
	        decode_timedelta,
	    )
	    chunked = decoded.chunk(
	        chunks, name_prefix="open_dataset-%s" % token, token=token
	    )
	    # Carry the file handle over to the chunked dataset.
	    chunked._file_obj = decoded._file_obj
	    return chunked


	def open_dataset(
	    filename_or_obj,
	    group=None,
	    decode_cf=True,
	    mask_and_scale=None,
	    decode_times=True,
	    autoclose=None,
	    concat_characters=True,
	    decode_coords=True,
	    engine=None,
	    chunks=None,
	    lock=None,
	    cache=None,
	    drop_variables=None,
	    backend_kwargs=None,
	    use_cftime=None,
	    decode_timedelta=None,
	):
	    """Open and decode a dataset from a file or file-like object.

	    Parameters
	    ----------
	    filename_or_obj : str, Path, file or xarray.backends.*DataStore
	        Strings and Path objects are interpreted as a path to a netCDF file
	        or an OpenDAP URL and opened with python-netCDF4, unless the
	        filename ends with .gz, in which case the file is gunzipped and
	        opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings
	        or file-like objects are opened by scipy.io.netcdf (netCDF3) or
	        h5py (netCDF4/HDF).
	    group : str, optional
	        Path to the netCDF4 group in the given file to open (only works
	        for netCDF4 files).
	    decode_cf : bool, optional
	        Whether to decode these variables, assuming they were saved
	        according to CF conventions. When false, ``mask_and_scale``,
	        ``decode_times``, ``concat_characters``, ``decode_coords`` and
	        ``decode_timedelta`` are all forced to False.
	    mask_and_scale : bool, optional
	        If True, replace array values equal to `_FillValue` with NA and
	        scale values according to the formula `original_values *
	        scale_factor + add_offset`, where `_FillValue`, `scale_factor` and
	        `add_offset` are taken from variable attributes (if they exist).
	        If the `_FillValue` or `missing_value` attribute contains multiple
	        values, a warning will be issued and all array values matching one
	        of the multiple values will be replaced by NA. Defaults to True
	        except for the pseudonetcdf backend.
	    decode_times : bool, optional
	        If True, decode times encoded in the standard NetCDF datetime
	        format into datetime objects. Otherwise, leave them encoded as
	        numbers.
	    autoclose : bool, optional
	        No longer used; passing any value other than None only emits a
	        ``FutureWarning``.
	    concat_characters : bool, optional
	        If True, concatenate along the last dimension of character arrays
	        to form string arrays. Dimensions will only be concatenated over
	        (and removed) if they have no corresponding variable and if they
	        are only used as the last dimension of character arrays.
	    decode_coords : bool, optional
	        If True, decode the 'coordinates' attribute to identify
	        coordinates in the resulting dataset.
	    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', \
	        'pseudonetcdf'}, optional
	        Engine to use when reading files. If not provided, the default
	        engine is chosen based on available dependencies, with a
	        preference for 'netcdf4'. For byte-strings and file-like objects
	        only 'scipy' and 'h5netcdf' are accepted, and the engine actually
	        used is determined from the file's magic number.
	    chunks : int or dict, optional
	        If chunks is provided, it is used to load the new dataset into
	        dask arrays. ``chunks={}`` loads the dataset with dask using a
	        single chunk for all arrays.
	    lock : False or duck threading.Lock, optional
	        Resource lock to use when reading data from disk. Only relevant
	        when using dask or another form of parallelism. By default,
	        appropriate locks are chosen to safely read and write files with
	        the currently active dask scheduler.
	    cache : bool, optional
	        If True, cache data loaded from the underlying datastore in memory
	        as NumPy arrays when accessed to avoid reading from the underlying
	        datastore multiple times. Defaults to True unless you specify the
	        `chunks` argument to use dask, in which case it defaults to False.
	        Does not change the behavior of coordinates corresponding to
	        dimensions, which always load their data from disk into a
	        ``pandas.Index``.
	    drop_variables : string or iterable, optional
	        A variable or list of variables to exclude from being parsed from
	        the dataset. This may be useful to drop variables with problems or
	        inconsistent values.
	    backend_kwargs : dictionary, optional
	        A dictionary of keyword arguments to pass on to the backend. This
	        may be useful when backend options would improve performance or
	        allow user control of dataset processing.
	    use_cftime : bool, optional
	        Only relevant if encoded dates come from a standard calendar
	        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
	        specified). If None (default), attempt to decode times to
	        ``np.datetime64[ns]`` objects; if this is not possible, decode
	        times to ``cftime.datetime`` objects. If True, always decode times
	        to ``cftime.datetime`` objects, regardless of whether or not they
	        can be represented using ``np.datetime64[ns]`` objects. If False,
	        always decode times to ``np.datetime64[ns]`` objects; if this is
	        not possible raise an error.
	    decode_timedelta : bool, optional
	        If True, decode variables and coordinates with time units in
	        {'days', 'hours', 'minutes', 'seconds', 'milliseconds',
	        'microseconds'} into timedelta objects. If False, leave them
	        encoded as numbers. If None (default), assume the same value as
	        ``decode_times``.

	    Returns
	    -------
	    dataset : Dataset
	        The newly created dataset.

	    Raises
	    ------
	    ValueError
	        If ``engine`` is not one of the recognized names, or if a
	        byte-string or file-like object is given together with an engine
	        other than 'scipy' or 'h5netcdf'.

	    Notes
	    -----
	    ``open_dataset`` opens the file with read-only access. When you modify
	    values of a Dataset, even one linked to files on disk, only the
	    in-memory copy you are manipulating in xarray is modified: the
	    original file on disk is never touched.

	    See Also
	    --------
	    open_mfdataset
	    """
	    # Kept as a list because its repr is part of the error message below.
	    known_engines = [
	        None,
	        "netcdf4",
	        "scipy",
	        "pydap",
	        "h5netcdf",
	        "pynio",
	        "cfgrib",
	        "pseudonetcdf",
	    ]
	    if engine not in known_engines:
	        raise ValueError(
	            "unrecognized engine for open_dataset: {}\n"
	            "must be one of: {}".format(engine, known_engines)
	        )

	    if autoclose is not None:
	        warnings.warn(
	            "The autoclose argument is no longer used by "
	            "xarray.open_dataset() and is now ignored; it will be removed in "
	            "a future version of xarray. If necessary, you can control the "
	            "maximum number of simultaneous open files with "
	            "xarray.set_options(file_cache_maxsize=...).",
	            FutureWarning,
	            stacklevel=2,
	        )

	    # Resolve defaults that depend on other arguments.
	    if mask_and_scale is None:
	        mask_and_scale = engine != "pseudonetcdf"

	    if not decode_cf:
	        # Turning CF decoding off disables every individual decoding step.
	        mask_and_scale = decode_times = concat_characters = False
	        decode_coords = decode_timedelta = False

	    if cache is None:
	        cache = chunks is None

	    if backend_kwargs is None:
	        backend_kwargs = {}

	    if isinstance(filename_or_obj, Path):
	        filename_or_obj = str(filename_or_obj)

	    if isinstance(filename_or_obj, AbstractDataStore):
	        store = filename_or_obj

	    elif isinstance(filename_or_obj, str):
	        filename_or_obj = _normalize_path(filename_or_obj)
	        if engine is None:
	            engine = _get_default_engine(filename_or_obj, allow_remote=True)

	        # One opener per engine; the lambdas keep each call lazy so only
	        # the selected backend is touched.
	        path_openers = {
	            "netcdf4": lambda: backends.NetCDF4DataStore.open(
	                filename_or_obj, group=group, lock=lock, **backend_kwargs
	            ),
	            "scipy": lambda: backends.ScipyDataStore(
	                filename_or_obj, **backend_kwargs
	            ),
	            "pydap": lambda: backends.PydapDataStore.open(
	                filename_or_obj, **backend_kwargs
	            ),
	            "h5netcdf": lambda: backends.H5NetCDFStore.open(
	                filename_or_obj, group=group, lock=lock, **backend_kwargs
	            ),
	            "pynio": lambda: backends.NioDataStore(
	                filename_or_obj, lock=lock, **backend_kwargs
	            ),
	            "pseudonetcdf": lambda: backends.PseudoNetCDFDataStore.open(
	                filename_or_obj, lock=lock, **backend_kwargs
	            ),
	            "cfgrib": lambda: backends.CfGribDataStore(
	                filename_or_obj, lock=lock, **backend_kwargs
	            ),
	        }
	        store = path_openers[engine]()

	    else:
	        # Byte-strings and file-like objects.
	        if engine not in (None, "scipy", "h5netcdf"):
	            raise ValueError(
	                "can only read bytes or file-like objects "
	                "with engine='scipy' or 'h5netcdf'"
	            )
	        # The engine actually used comes from the file signature.
	        engine = _get_engine_from_magic_number(filename_or_obj)
	        if engine == "scipy":
	            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
	        elif engine == "h5netcdf":
	            store = backends.H5NetCDFStore.open(
	                filename_or_obj, group=group, lock=lock, **backend_kwargs
	            )

	    with close_on_error(store):
	        dataset = maybe_decode_store(
	            store,
	            mask_and_scale,
	            decode_times,
	            concat_characters,
	            decode_coords,
	            drop_variables,
	            use_cftime,
	            decode_timedelta,
	            cache,
	            chunks,
	            filename_or_obj,
	            group,
	            decode_cf,
	            engine,
	        )

	    # Ensure source filename always stored in dataset object (GH issue #2550)
	    if "source" not in dataset.encoding and isinstance(filename_or_obj, str):
	        dataset.encoding["source"] = filename_or_obj

	    return dataset
No results found