Skip to content

Instantly share code, notes, and snippets.

@spencerkclark
Last active January 30, 2017 01:17
Show Gist options
  • Save spencerkclark/c33eaf241e52f2fb6ab877c64961de9d to your computer and use it in GitHub Desktop.
Save spencerkclark/c33eaf241e52f2fb6ab877c64961de9d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Updated Experimental NetCDFTimeIndex\n===================================="
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "import re\n\nimport netcdftime\nimport numpy as np\nimport pandas as pd\nimport xarray as xr\n\nfrom datetime import timedelta\nfrom dateutil import parser\nfrom pandas.core.common import _maybe_box, _values_from_object\nfrom pandas.types.common import is_scalar",
"execution_count": 1,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Add Stephan's ISO 8601 Date Parser\n----------------------------------\n- Sidesteps issues in using dateutil's date parser for unusual low-year date strings"
},
{
"metadata": {
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "# Implement Stephan's ISO 8601 date parser\ndef named(name, pattern):\n    \"\"\"Wrap ``pattern`` in a capturing group named ``name``.\"\"\"\n    return '(?P<' + name + '>' + pattern + ')'\n\n\ndef optional(x):\n    \"\"\"Wrap ``x`` in a non-capturing group that may match zero or one time.\"\"\"\n    return '(?:' + x + ')?'\n\n\ndef trailing_optional(xs):\n    \"\"\"Nest the patterns in ``xs`` so that everything after the first is optional.\n\n    A later pattern can only match if every earlier one matched, which lets a\n    string stop after any field. An empty sequence gives an empty pattern.\n    \"\"\"\n    if not xs:\n        return ''\n    return xs[0] + optional(trailing_optional(xs[1:]))\n\n\ndef build_pattern(date_sep=r'\\-', datetime_sep='T', time_sep=r'\\:'):\n    \"\"\"Return an anchored regex string for a possibly truncated ISO 8601 datetime.\n\n    ``date_sep``, ``datetime_sep`` and ``time_sep`` are regex fragments placed\n    before the month/day, hour, and minute/second fields respectively; an empty\n    separator means the fields are adjacent. Only the four-digit year is\n    mandatory; the seconds field may carry a fractional part.\n    \"\"\"\n    # Raw strings keep the regex escapes out of Python's own escape processing.\n    pieces = [(None, 'year', r'\\d{4}'),\n              (date_sep, 'month', r'\\d{2}'),\n              (date_sep, 'day', r'\\d{2}'),\n              (datetime_sep, 'hour', r'\\d{2}'),\n              (time_sep, 'minute', r'\\d{2}'),\n              (time_sep, 'second', r'\\d{2}' + optional(r'\\.\\d+'))]\n    pattern_list = []\n    for sep, name, sub_pattern in pieces:\n        pattern_list.append((sep if sep else '') + named(name, sub_pattern))\n    # TODO: allow timezone offsets?\n    return '^' + trailing_optional(pattern_list) + '$'\n\n\n# Compiled once here rather than rebuilt on every parse_iso8601 call.\n# The basic format (no date/time separators) is tried before the extended one.\n_ISO8601_PATTERNS = (re.compile(build_pattern(date_sep='', time_sep='')),\n                     re.compile(build_pattern()))\n\n\ndef parse_iso8601(datetime_string):\n    \"\"\"Parse an ISO 8601 date/datetime string into its component fields.\n\n    Returns a dict with keys 'year', 'month', 'day', 'hour', 'minute' and\n    'second'; values are the matched substrings, or None for fields the string\n    does not include. Raises ValueError if neither format matches.\n    \"\"\"\n    for pattern in _ISO8601_PATTERNS:\n        match = pattern.match(datetime_string)\n        if match:\n            return match.groupdict()\n    raise ValueError('no ISO-8601 match for string: %s' % datetime_string)",
"execution_count": 2,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Implement Custom NetCDFTimeIndex\n--------------------------------\n- Field accessors -> groupby support\n- Partial datetime string indexing (with ISO 8601 compliant date strings)\n- Use in a Series or DataFrame"
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# netcdftime datetime classes that NetCDFTimeIndex.get_value treats as exact-match keys\n_NETCDFTIME_TYPES = (netcdftime.Datetime360Day, netcdftime.DatetimeAllLeap,\n                     netcdftime.DatetimeGregorian, netcdftime.DatetimeJulian,\n                     netcdftime.DatetimeNoLeap, netcdftime.DatetimeProlepticGregorian)\n\n\ndef get_date_field(datetimes, field):\n    \"\"\"Return a list with attribute ``field`` of each element of ``datetimes``.\n\n    Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tslib.pyx#L4564\n    \"\"\"\n    return [getattr(date, field) for date in datetimes]\n\n\ndef _field_accessor(name, docstring=None):\n    \"\"\"Build a read-only property returning field ``name`` of every index element.\n\n    Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L63\n    \"\"\"\n    def f(self):\n        return get_date_field(self._data, name)\n\n    f.__name__ = name\n    f.__doc__ = docstring\n    return property(f)\n\n\nclass NetCDFTimeIndex(pd.Index):\n    \"\"\"Experimental pandas Index whose elements are netcdftime datetime objects.\n\n    Offers date-field accessors, partial ISO 8601 string indexing, and the\n    lookup hooks needed to serve as the index of a Series or DataFrame.\n    \"\"\"\n\n    def __new__(cls, data):\n        \"\"\"Create the index, keeping ``data`` as a NumPy array in ``_data``.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L246\n        \"\"\"\n        result = object.__new__(cls)\n        result._data = np.array(data)\n        return result\n\n    year = _field_accessor('year', 'The year of the datetime')\n    month = _field_accessor('month', 'The month of the datetime')\n    day = _field_accessor('day', 'The days of the datetime')\n    hour = _field_accessor('hour', 'The hours of the datetime')\n    minute = _field_accessor('minute', 'The minutes of the datetime')\n    second = _field_accessor('second', 'The seconds of the datetime')\n    microsecond = _field_accessor('microsecond',\n                                  'The microseconds of the datetime')\n\n    def parse_iso8601_with_reso(self, dtype, timestr):\n        \"\"\"Parse ``timestr`` into a ``dtype`` date and the name of its finest field.\n\n        Fields absent from the string keep the values of ``dtype(1, 1, 1)``.\n        Returns ``(date, resolution)`` with ``resolution`` one of 'year',\n        'month', 'day', 'hour', 'minute' or 'second'.\n        \"\"\"\n        default = dtype(1, 1, 1)\n        result = parse_iso8601(timestr)\n        replace = {}\n\n        for attr in ['year', 'month', 'day', 'hour', 'minute', 'second']:\n            value = result.get(attr, None)\n            if value is not None:\n                # NOTE: the parser accepts fractional seconds, but int() raises\n                # ValueError for them, so sub-second strings are not supported.\n                replace[attr] = int(value)\n                resolution = attr\n\n        return default.replace(**replace), resolution\n\n    def _parsed_string_to_bounds(self, dtype, resolution, parsed):\n        \"\"\"Return the (start, end) dates of the period that ``parsed`` denotes.\n\n        ``end`` is one microsecond before the following period begins.\n        Raises KeyError if ``resolution`` is not one of the six date fields.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1229\n        \"\"\"\n        if resolution == 'year':\n            return (dtype(parsed.year, 1, 1),\n                    dtype(parsed.year + 1, 1, 1) - timedelta(microseconds=1))\n        if resolution == 'month':\n            if parsed.month == 12:\n                # December rolls over into January of the next year.\n                end = dtype(parsed.year + 1, 1, 1) - timedelta(microseconds=1)\n            else:\n                end = (dtype(parsed.year, parsed.month + 1, 1) -\n                       timedelta(microseconds=1))\n            return dtype(parsed.year, parsed.month, 1), end\n        if resolution == 'day':\n            start = dtype(parsed.year, parsed.month, parsed.day)\n            return start, start + timedelta(days=1, microseconds=-1)\n        if resolution == 'hour':\n            start = dtype(parsed.year, parsed.month, parsed.day, parsed.hour)\n            return start, start + timedelta(hours=1, microseconds=-1)\n        if resolution == 'minute':\n            start = dtype(parsed.year, parsed.month, parsed.day, parsed.hour,\n                          parsed.minute)\n            return start, start + timedelta(minutes=1, microseconds=-1)\n        if resolution == 'second':\n            start = dtype(parsed.year, parsed.month, parsed.day, parsed.hour,\n                          parsed.minute, parsed.second)\n            return start, start + timedelta(seconds=1, microseconds=-1)\n        raise KeyError(resolution)\n\n    def _partial_date_slice(self, dtype, resolution, parsed,\n                            use_lhs=True, use_rhs=True):\n        \"\"\"Return the integer positions of elements inside the bounds of ``parsed``.\n\n        ``use_lhs`` / ``use_rhs`` switch the lower / upper bound check on or off.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1294\n        \"\"\"\n        start, end = self._parsed_string_to_bounds(dtype, resolution, parsed)\n        # Start from an all-True array so the result is an ndarray even when\n        # both bounds are disabled (a bare ``True & True`` has no .nonzero()).\n        mask = np.ones(len(self._data), dtype=bool)\n        if use_lhs:\n            mask = mask & (self._data >= start)\n        if use_rhs:\n            mask = mask & (self._data <= end)\n        return mask.nonzero()[0]\n\n    def _get_string_slice(self, key, use_lhs=True, use_rhs=True):\n        \"\"\"Return the integer positions matched by the partial date string ``key``.\n\n        The date type is taken from the first element of the index.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1462\n        \"\"\"\n        dtype = type(self._data[0])\n        parsed, resolution = self.parse_iso8601_with_reso(dtype, key)\n        loc = self._partial_date_slice(dtype, resolution, parsed, use_lhs,\n                                       use_rhs)\n        return loc\n\n    def get_loc(self, key, method=None, tolerance=None):\n        \"\"\"Locate ``key``: strings use partial date matching, others ``pd.Index.get_loc``.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1383\n        \"\"\"\n        if isinstance(key, pd.compat.string_types):\n            return self._get_string_slice(key)\n        else:\n            return pd.Index.get_loc(self, key, method=method,\n                                    tolerance=tolerance)\n\n    def _maybe_cast_slice_bound(self, label, side, kind):\n        \"\"\"Turn a string slice bound into the date at the matching edge of its period.\n\n        Non-string labels are returned unchanged.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1422\n        \"\"\"\n        if isinstance(label, pd.compat.string_types):\n            dtype = type(self._data[0])\n            parsed, resolution = self.parse_iso8601_with_reso(dtype, label)\n            start, end = self._parsed_string_to_bounds(dtype, resolution,\n                                                       parsed)\n            # A one-element index counts as monotonic decreasing as well as\n            # increasing, so only swap the bounds when there is more than one\n            # element; otherwise a slice over a single-element index is empty.\n            if self.is_monotonic_decreasing and len(self) > 1:\n                return end if side == 'left' else start\n            return start if side == 'left' else end\n        else:\n            return label\n\n    # Enable use in a Series or DataFrame\n    def get_value(self, series, key):\n        \"\"\"Look up ``key`` in ``series``.\n\n        netcdftime datetime keys go straight to the engine lookup. Other keys\n        are tried as ordinary index labels, then as partial date strings, and\n        finally through the engine; KeyError is raised if all of these fail.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1340\n        \"\"\"\n        if isinstance(key, _NETCDFTIME_TYPES):\n            return self.get_value_maybe_box(series, key)\n\n        try:\n            return _maybe_box(self, pd.Index.get_value(self, series, key),\n                              series, key)\n        except KeyError:\n            try:\n                loc = self._get_string_slice(key)\n                return series[loc]\n            except (TypeError, ValueError, KeyError, AttributeError):\n                # Not a usable date string; fall through to the engine lookup.\n                pass\n\n            try:\n                return self.get_value_maybe_box(series, key)\n            except (TypeError, ValueError, KeyError):\n                raise KeyError(key)\n\n    def get_value_maybe_box(self, series, key):\n        \"\"\"Fetch the value for ``key`` through the index engine and box it.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/index.py#L1373\n        \"\"\"\n        values = self._engine.get_value(_values_from_object(series),\n                                        key)\n        return _maybe_box(self, values, series, key)\n\n    # Needed for loc method of indexing in Series or DataFrame\n    def __contains__(self, key):\n        \"\"\"Return whether ``key`` (a date or a partial date string) is in the index.\n\n        Adapted from https://github.com/pandas-dev/pandas/blob/master/pandas/tseries/base.py#L233\n        \"\"\"\n        try:\n            res = self.get_loc(key)\n        except (KeyError, TypeError, ValueError):\n            return False\n        if isinstance(res, np.ndarray) and res.dtype != bool:\n            # String keys give an array of integer positions; np.any() would\n            # wrongly report False when the only match is position 0.\n            return res.size > 0\n        return is_scalar(res) or type(res) == slice or bool(np.any(res))",
"execution_count": 3,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Examples (using DatetimeAllLeap)\n--------------------------------"
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# First day of every month of years 1 and 2, as DatetimeAllLeap objects\ndates_0001 = [netcdftime.DatetimeAllLeap(1, m, 1) for m in range(1, 13)]\ndates_0002 = [netcdftime.DatetimeAllLeap(2, m, 1) for m in range(1, 13)]",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "# 24 values (35..58) along a 'time' dimension whose coordinate is a NetCDFTimeIndex\nda = xr.DataArray(np.arange(35, 35 + 24), coords=[NetCDFTimeIndex(dates_0001 + dates_0002)], dims=['time'])",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Group on the index's month field accessor and average each month over both years\nda.groupby('time.month').mean('time')",
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (month: 12)>\narray([ 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,\n 52.])\nCoordinates:\n * month (month) int64 1 2 3 4 5 6 7 8 9 10 11 12"
},
"metadata": {},
"execution_count": 6
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Partial string indexing: a year-resolution string selects every date in year 1\nda.sel(time='0001')",
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (time: 12)>\narray([35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46])\nCoordinates:\n * time (time) object 1-01-01 00:00:00 1-02-01 00:00:00 ..."
},
"metadata": {},
"execution_count": 7
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Year-resolution slice bounds: from the start of year 1 through the end of year 2\nda.sel(time=slice('0001', '0002'))",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (time: 24)>\narray([35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,\n 52, 53, 54, 55, 56, 57, 58])\nCoordinates:\n * time (time) object 1-01-01 00:00:00 1-02-01 00:00:00 ..."
},
"metadata": {},
"execution_count": 8
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Month-resolution slice bounds: January through May of year 1\nda.sel(time=slice('0001-01', '0001-05'))",
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (time: 5)>\narray([35, 36, 37, 38, 39])\nCoordinates:\n * time (time) object 1-01-01 00:00:00 1-02-01 00:00:00 ..."
},
"metadata": {},
"execution_count": 9
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Day-resolution slice bounds: 0001-01-01 through 0001-02-29\nda.sel(time=slice('0001-01-01', '0001-02-29'))",
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (time: 2)>\narray([35, 36])\nCoordinates:\n * time (time) object 1-01-01 00:00:00 1-02-01 00:00:00"
},
"metadata": {},
"execution_count": 10
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Slice bounds may also be netcdftime datetime objects instead of strings\nda.sel(time=slice(netcdftime.DatetimeAllLeap(1, 1, 1), netcdftime.DatetimeAllLeap(1, 5, 1)))",
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<xarray.DataArray (time: 5)>\narray([35, 36, 37, 38, 39])\nCoordinates:\n * time (time) object 1-01-01 00:00:00 1-02-01 00:00:00 ..."
},
"metadata": {},
"execution_count": 11
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Try in a Series\n---------------"
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# The same 24 values in a pandas Series indexed by a NetCDFTimeIndex\nseries = pd.Series(np.arange(35, 35 + 24), index=NetCDFTimeIndex(dates_0001 + dates_0002))",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# A year-resolution string key selects all twelve entries of year 1\nseries['0001']",
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " 1-01-01 00:00:00 35\n 1-02-01 00:00:00 36\n 1-03-01 00:00:00 37\n 1-04-01 00:00:00 38\n 1-05-01 00:00:00 39\n 1-06-01 00:00:00 40\n 1-07-01 00:00:00 41\n 1-08-01 00:00:00 42\n 1-09-01 00:00:00 43\n 1-10-01 00:00:00 44\n 1-11-01 00:00:00 45\n 1-12-01 00:00:00 46\ndtype: int64"
},
"metadata": {},
"execution_count": 13
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# An exact netcdftime datetime key returns the single matching value\nseries[netcdftime.DatetimeAllLeap(1, 2, 1)]",
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "36"
},
"metadata": {},
"execution_count": 14
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# An integer key returns the value at that position (second element here)\nseries[1]",
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "36"
},
"metadata": {},
"execution_count": 15
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Label-based .loc accepts the same partial date strings\nseries.loc['0001']",
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " 1-01-01 00:00:00 35\n 1-02-01 00:00:00 36\n 1-03-01 00:00:00 37\n 1-04-01 00:00:00 38\n 1-05-01 00:00:00 39\n 1-06-01 00:00:00 40\n 1-07-01 00:00:00 41\n 1-08-01 00:00:00 42\n 1-09-01 00:00:00 43\n 1-10-01 00:00:00 44\n 1-11-01 00:00:00 45\n 1-12-01 00:00:00 46\ndtype: int64"
},
"metadata": {},
"execution_count": 16
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Positional indexing with a single integer\nseries.iloc[1]",
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "36"
},
"metadata": {},
"execution_count": 17
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Positional slicing keeps the NetCDFTimeIndex labels on the result\nseries.iloc[1:4]",
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " 1-02-01 00:00:00 36\n 1-03-01 00:00:00 37\n 1-04-01 00:00:00 38\ndtype: int64"
},
"metadata": {},
"execution_count": 18
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Try in a DataFrame\n------------------"
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# The same 24 values as a single-column DataFrame indexed by a NetCDFTimeIndex\ndf = pd.DataFrame(np.arange(35, 35 + 24), index=NetCDFTimeIndex(dates_0001 + dates_0002))",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# A year-resolution string selects every row of year 1\ndf.loc['0001']",
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1-01-01 00:00:00</th>\n <td>35</td>\n </tr>\n <tr>\n <th>1-02-01 00:00:00</th>\n <td>36</td>\n </tr>\n <tr>\n <th>1-03-01 00:00:00</th>\n <td>37</td>\n </tr>\n <tr>\n <th>1-04-01 00:00:00</th>\n <td>38</td>\n </tr>\n <tr>\n <th>1-05-01 00:00:00</th>\n <td>39</td>\n </tr>\n <tr>\n <th>1-06-01 00:00:00</th>\n <td>40</td>\n </tr>\n <tr>\n <th>1-07-01 00:00:00</th>\n <td>41</td>\n </tr>\n <tr>\n <th>1-08-01 00:00:00</th>\n <td>42</td>\n </tr>\n <tr>\n <th>1-09-01 00:00:00</th>\n <td>43</td>\n </tr>\n <tr>\n <th>1-10-01 00:00:00</th>\n <td>44</td>\n </tr>\n <tr>\n <th>1-11-01 00:00:00</th>\n <td>45</td>\n </tr>\n <tr>\n <th>1-12-01 00:00:00</th>\n <td>46</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " 0\n 1-01-01 00:00:00 35\n 1-02-01 00:00:00 36\n 1-03-01 00:00:00 37\n 1-04-01 00:00:00 38\n 1-05-01 00:00:00 39\n 1-06-01 00:00:00 40\n 1-07-01 00:00:00 41\n 1-08-01 00:00:00 42\n 1-09-01 00:00:00 43\n 1-10-01 00:00:00 44\n 1-11-01 00:00:00 45\n 1-12-01 00:00:00 46"
},
"metadata": {},
"execution_count": 20
}
]
},
{
"metadata": {
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# Label slice with day-resolution strings; both end points are included\ndf.loc['0001-01-01':'0001-02-01']",
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1-01-01 00:00:00</th>\n <td>35</td>\n </tr>\n <tr>\n <th>1-02-01 00:00:00</th>\n <td>36</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " 0\n 1-01-01 00:00:00 35\n 1-02-01 00:00:00 36"
},
"metadata": {},
"execution_count": 21
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Positional row lookup returns the row as a Series named by its date label\ndf.iloc[1]",
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "0 36\nName: 1-02-01 00:00:00, dtype: int64"
},
"metadata": {},
"execution_count": 22
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Positional row slicing keeps the NetCDFTimeIndex labels on the result\ndf.iloc[1:4]",
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": "<div>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1-02-01 00:00:00</th>\n <td>36</td>\n </tr>\n <tr>\n <th>1-03-01 00:00:00</th>\n <td>37</td>\n </tr>\n <tr>\n <th>1-04-01 00:00:00</th>\n <td>38</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " 0\n 1-02-01 00:00:00 36\n 1-03-01 00:00:00 37\n 1-04-01 00:00:00 38"
},
"metadata": {},
"execution_count": 23
}
]
}
],
"metadata": {
"kernelspec": {
"name": "python2",
"display_name": "Python 2",
"language": "python"
},
"language_info": {
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"name": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12",
"file_extension": ".py",
"codemirror_mode": {
"version": 2,
"name": "ipython"
}
},
"gist_id": "c33eaf241e52f2fb6ab877c64961de9d"
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment