Skip to content

Instantly share code, notes, and snippets.

@benbovy
Created November 13, 2016 02:01
Show Gist options
  • Save benbovy/92e7c76220af1aaa4b3a0b65374e233a to your computer and use it in GitHub Desktop.
Save benbovy/92e7c76220af1aaa4b3a0b65374e233a to your computer and use it in GitHub Desktop.
Dataset node
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hierarchical collection of xarray datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A class `DatasetNode`, inspired by `h5py.Group`. A notable difference is that the first class accepts only one `xarray.Dataset` object rather than a collection of `h5py.Dataset` objects for the second class. `h5py.Dataset` objects are close to `xarray.Variable` objects and `xarray.Dataset` objects are already collections of `xarray.Variable` objects.\n",
"\n",
"Design questions:\n",
"\n",
"- Immutable or mutable attached `dataset`?\n",
"\n",
"- Immutable or mutable `name`?\n",
"\n",
"- Immutable or mutable `parent`?\n",
"\n",
"- Do we need to allow `DatasetNode` to have user-defined metadata (e.g., 'datasource')? Or a better place would be global attributes of its attached dataset?\n",
"\n",
"- Feature: create a 1-depth tree of datasets from splitting a dataset with advanced indexing, apply a function to all datasets and then merge back the results into a single dataset (i.e., kind of \"map/reduce\" operation at the xarray.Dataset level of abstraction). Any use case for this?\n",
"\n",
"\n",
"TODO: tree manipulation (delete, copy or move nodes...) if mutable.\n",
"\n",
"- Delete a node: either (1) delete all child nodes recursively or (2) update parent of all direct child nodes? Option 1 seems better and more consistent with h5py and netCDF4. Option 2 only if parent is mutable. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from collections import MutableMapping\n",
"\n",
"from xarray.core.common import AttrAccessMixin\n",
"from xarray.core.dataset import Dataset\n",
"from xarray.core.pycompat import basestring\n",
"\n",
"\n",
"class DatasetNode(MutableMapping, AttrAccessMixin):\n",
" \"\"\"Node of a hierarchical collection of Dataset objects.\"\"\"\n",
"\n",
" def __init__(self, name, parent=None, dataset=None):\n",
" \"\"\"Create a dataset node.\n",
" \n",
" Parameters\n",
" ----------\n",
" name : str\n",
" Name of the node (ignored if root node).\n",
" It can't contain any '/' character.\n",
" parent : DatasetNode object or None\n",
" Parent node (root node if None).\n",
" dataset : Dataset object\n",
" Dataset to be attached to this node.\n",
" \n",
" \"\"\"\n",
" if (not isinstance(name, basestring) or '/' in name):\n",
" raise ValueError(\"invalid name %r\" % name)\n",
" self._name = name\n",
" \n",
" self._children = []\n",
" \n",
" # TODO: move this to @parent.setter if parent is mutable\n",
" if parent is None:\n",
" self._name = ''\n",
" if parent is not None:\n",
" if not isinstance(parent, type(self)):\n",
" raise ValueError(\n",
" \"cannot set parent: must be a %r object, not %r\"\n",
" # report the type of the offending `parent` argument\n",
" # (was `obj`, an undefined name -> NameError)\n",
" % (type(self).__name__, type(parent).__name__))\n",
" #if parent == self:\n",
" # # does not work when both self and parent has no children \n",
" # raise ValueError(\"cannot set parent: \"\n",
" # \"a group cannot be parent of itself\")\n",
" # TODO: check tree integrity (parent is in children)\n",
" # needed only if parent is mutable\n",
" if self._name in [c._name for c in parent._children]:\n",
" raise ValueError(\"cannot set parent: parent node %r \"\n",
" \"has already a child node named %r\"\n",
" % (parent._name, self._name))\n",
" parent._children.append(self)\n",
" self._parent = parent\n",
" \n",
" if dataset is None:\n",
" dataset = Dataset()\n",
" if not isinstance(dataset, Dataset):\n",
" raise ValueError('%r object is not a Dataset'\n",
" % type(dataset).__name__)\n",
" self._dataset = dataset\n",
" \n",
" @property\n",
" def parent(self):\n",
" return self._parent\n",
" \n",
" @property\n",
" def children(self):\n",
" return tuple(self._children)\n",
" \n",
" @property\n",
" def _attr_sources(self):\n",
" return [self]\n",
" \n",
" @property\n",
" def name(self):\n",
" \"\"\"Base name of this node.\"\"\"\n",
" return self._name\n",
" \n",
" @property\n",
" def dataset(self):\n",
" return self._dataset\n",
" \n",
" def _walk_parents(self):\n",
" \"\"\"Walk through this node and its parents.\"\"\"\n",
" yield self\n",
" node = self._parent\n",
" while node is not None:\n",
" yield node\n",
" node = node._parent\n",
" \n",
" def _walk_children(self):\n",
" \"\"\"Recursively walk through this node and all its child nodes.\"\"\"\n",
" yield self\n",
" for child in self._children:\n",
" for node in child._walk_children():\n",
" yield node\n",
" \n",
" @property\n",
" def path(self):\n",
" \"\"\"Full path to this node, given as a UNIX-like path.\"\"\"\n",
" if self._parent is None:\n",
" return '/'\n",
" else:\n",
" path_items = [dsg._name for dsg in self._walk_parents()]\n",
" return '/'.join(path_items[-1::-1])\n",
" \n",
" @property\n",
" def root_node(self):\n",
" \"\"\"Return the root node in the tree.\"\"\"\n",
" for node in self._walk_parents():\n",
" pass\n",
" return node\n",
" \n",
" def add_node(self, name, dataset=None):\n",
" \"\"\"Add a child node.\"\"\"\n",
" return type(self)(name, parent=self, dataset=dataset)\n",
" \n",
" def _get_node_depth1(self, node, key):\n",
" if node is None:\n",
" return None\n",
" if key == '..':\n",
" return node._parent\n",
" if key == '.':\n",
" return node \n",
" for child in node._children:\n",
" if key == child._name:\n",
" return child\n",
" return None\n",
" \n",
" def get(self, path, default=None):\n",
" \"\"\"Return a node given any relative or absolute\n",
" UNIX-like path.\n",
"\n",
" \"\"\"\n",
" if path == '/':\n",
" return self.root_node\n",
" elif path.startswith('/'):\n",
" node = self.root_node\n",
" path = path[1:]\n",
" else:\n",
" node = self\n",
"\n",
" for key in path.split('/'):\n",
" node = self._get_node_depth1(node, key)\n",
" if node is None:\n",
" # stop at the first unresolved component: looping on\n",
" # with `node = default` would feed `default` back into\n",
" # _get_node_depth1 for the remaining keys\n",
" return default\n",
" \n",
" return node\n",
"\n",
" def __getitem__(self, key):\n",
" node = self.get(key)\n",
" if node is None:\n",
" raise KeyError('node %r not found' % key)\n",
" return node\n",
"\n",
" def __setitem__(self, key, value):\n",
" # TODO: decide what to do here.\n",
" # no comma between the literals: implicit concatenation gives\n",
" # one message string (a comma made a 2-arg TypeError that\n",
" # rendered the message as a tuple)\n",
" raise TypeError(\"not allowed to add/overwrite a node \"\n",
" \"with setitem. Use '.add_node()' instead\")\n",
" \n",
" def __delitem__(self, key):\n",
" # TODO: decide what to do here. \n",
" raise TypeError()\n",
" \n",
" def __iter__(self):\n",
" return iter((c._name for c in self._children))\n",
" \n",
" def __len__(self):\n",
" return len(self._children)\n",
" \n",
" def apply_datasets(self, func, recursive=False):\n",
" \"\"\"\n",
" Apply a function to dataset of this node and\n",
" datasets of all of its child nodes.\n",
" \n",
" Parameters\n",
" ----------\n",
" func : callable\n",
" Function to apply to datasets with signature:\n",
" `func(name, dataset) -> None or return value`.\n",
" recursive : bool, optional\n",
" If True, apply the function to all child nodes\n",
" recursively.\n",
" \"\"\"\n",
" if not recursive:\n",
" children_nodes = self._children\n",
" else:\n",
" # call the generator method: iterating over the bound\n",
" # method object itself would raise TypeError\n",
" children_nodes = self._walk_children()\n",
" \n",
" for node in children_nodes:\n",
" yield func(node._name, node._dataset)\n",
" \n",
" def merge_datasets(self, recursive=False):\n",
" \"\"\"Return a dataset from merging datasets\n",
" of this node and of all of its child nodes.\n",
" \"\"\"\n",
" # TODO:\n",
" pass\n",
" \n",
" def __repr__(self):\n",
" return \"<%s %r (%d nodes)>\\n%s\" % (\n",
" type(self).__name__, self.path,\n",
" len(self), repr(self.dataset)[17:])\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create a tree of Dataset objects from groups in a netCDF4 file.\n",
"# This is just for the examples below. Not very nice to call\n",
"# xr.open_dataset() and Dataset.load() to load data from each group.\n",
"\n",
"import os\n",
"import netCDF4\n",
"import xarray as xr\n",
"\n",
"def load_children_recursive(filename, node, ncgroup):\n",
" for g in ncgroup.groups.values():\n",
" name = os.path.basename(g.path)\n",
" ds = xr.open_dataset(filename, group=g.path).load()\n",
" node.add_node(name, dataset=ds)\n",
" load_children_recursive(filename, node[name], g)\n",
"\n",
"def open_dataset_as_nodes(filename):\n",
" with netCDF4.Dataset(filename, mode='r') as ncfile:\n",
" ds = xr.open_dataset(filename).load()\n",
" tree_root = DatasetNode('', dataset=ds)\n",
" load_children_recursive(filename, tree_root, ncfile)\n",
" return tree_root"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example: loading all groups in a netCDF file into a tree of Dataset objects\n",
"\n",
"Example file `test_hgroups.nc` downloaded from http://www.unidata.ucar.edu/software/netcdf/examples/files.html "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dstree = open_dataset_as_nodes('data/test_hgroups.nc')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/' (7 nodes)>\n",
"Dimensions: (recNum: 74)\n",
"Coordinates:\n",
" * recNum (recNum) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...\n",
"Data variables:\n",
" UTC_time (recNum) object '2012-03-04 03:54:19' '2012-03-04 03:54:42' ..."
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dstree['/']"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.Dataset>\n",
"Dimensions: (recNum: 74)\n",
"Coordinates:\n",
" * recNum (recNum) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...\n",
"Data variables:\n",
" UTC_time (recNum) object '2012-03-04 03:54:19' '2012-03-04 03:54:42' ..."
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dstree.dataset"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['mozaic_flight_2012030403540535_ascent',\n",
" 'mozaic_flight_2012030321335035_descent',\n",
" 'mozaic_flight_2012030403540535_descent',\n",
" 'mozaic_flight_2012030412545335_ascent',\n",
" 'mozaic_flight_2012030419144751_ascent',\n",
" 'mozaic_flight_2012030319051051_descent',\n",
" 'mozaic_flight_2012030421382353_ascent']"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[name for name in dstree]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/mozaic_flight_2012030319051051_descent' (0 nodes)>\n",
"Dimensions: (air_press: 78)\n",
"Coordinates:\n",
" * air_press (air_press) float64 1.005e+05 9.86e+04 9.696e+04 9.51e+04 ...\n",
"Data variables:\n",
" CO (air_press) float64 -99.0 -99.0 -99.0 -99.0 -99.0 -99.0 -99.0 ...\n",
" O3 (air_press) float64 3.0 12.0 19.0 27.0 29.0 34.0 33.0 33.0 ...\n",
" altitude (air_press) float64 70.8 229.3 369.6 531.4 673.8 825.2 964.9 ...\n",
" UTC_time (air_press) object '2012-03-04 04:46:11' ...\n",
" lat float64 50.03\n",
" lon float64 8.543\n",
"Attributes:\n",
" airport_dep: WDH\n",
" flight: 2012030319051051\n",
" level: calibrated\n",
" airport_arr: FRA\n",
" mission: mozaic\n",
" time_dep: 2012-03-03 07:05:10\n",
" aircraft: 2\n",
" link: http://www.iagos.fr/extract\n",
" phase: descent\n",
" time_arr: 2012-03-04 04:46:40"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dstree.mozaic_flight_2012030319051051_descent"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/mozaic_flight_2012030321335035_descent' (0 nodes)>\n",
"Dimensions: (air_press: 76)\n",
"Coordinates:\n",
" * air_press (air_press) float64 1.007e+05 9.873e+04 9.688e+04 9.521e+04 ...\n",
"Data variables:\n",
" CO (air_press) float64 187.0 194.0 186.0 169.0 181.0 152.0 153.0 ...\n",
" O3 (air_press) float64 -99.0 -99.0 -99.0 -99.0 -99.0 -99.0 -99.0 ...\n",
" altitude (air_press) float64 54.3 218.3 376.9 521.9 699.1 835.1 977.5 ...\n",
" UTC_time (air_press) object '2012-03-04 01:04:37' ...\n",
" lat float64 32.01\n",
" lon float64 34.89\n",
"Attributes:\n",
" airport_dep: FRA\n",
" flight: 2012030321335035\n",
" level: calibrated\n",
" airport_arr: TLV\n",
" mission: mozaic\n",
" time_dep: 2012-03-03 09:33:50\n",
" aircraft: 3\n",
" link: http://www.iagos.fr/extract\n",
" phase: descent\n",
" time_arr: 2012-03-04 01:05:08"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dstree.mozaic_flight_2012030321335035_descent"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic tests (tree building and browsing, no dataset) "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"root = DatasetNode('root')\n",
"node1 = DatasetNode('node1', parent=root)\n",
"node2 = DatasetNode('node2', parent=root)\n",
"cnode1 = node1.add_node('cnode1')\n",
"ccnode1 = cnode1.add_node('ccnode1')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/' (2 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"root"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/node1/cnode1/ccnode1' (0 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccnode1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/node1' (1 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccnode1.parent.parent.parent.node1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/node1' (1 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"root['node1']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/node2' (0 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccnode1['../../../node2']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/' (2 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccnode1['/']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<DatasetNode '/node2' (0 nodes)>\n",
"Dimensions: ()\n",
"Coordinates:\n",
" *empty*\n",
"Data variables:\n",
" *empty*"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ccnode1['/node2']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['node1', 'node2']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[cname for cname in root]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'node1' in root"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(<DatasetNode '/node1' (1 nodes)>\n",
" Dimensions: ()\n",
" Coordinates:\n",
" *empty*\n",
" Data variables:\n",
" *empty*, <DatasetNode '/node2' (0 nodes)>\n",
" Dimensions: ()\n",
" Coordinates:\n",
" *empty*\n",
" Data variables:\n",
" *empty*)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"root.children"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['node1', 'node2']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[n for n in root.apply_datasets(lambda name, ds: name)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [xarray_dev_py35]",
"language": "python",
"name": "Python [xarray_dev_py35]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment