flamingbear · April 25, 2015 21:07
diff --git a/Demonstrate_Rolling_Average.ipynb b/Demonstrate_Rolling_Average.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Demonstrate rolling average for NSIDC timeseries data\n",
    "\n",
    "Given a timeseries with sometimes irregular or variable timeperiods between data points,\n",
    "compute an N day rolling mean, where M values must be observed to have a valid data point.\n",
    "\n",
    "N = number of days to average over\n",
    "M = number of valid data observations to get data\n",
    "\n",
    "The input data format is just a date and extent for each day we have data.\n",
    "```\n",
    "Year, Month, Day,     Extent,    Missing, Source Data\n",
    "YYYY,    MM,  DD, 10^6 sq km, 10^6 sq km, Source data product web site: http://nsidc.org/d....\n",
    "1978,    10,  26,     10.231,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
    "1978,    10,  28,     10.420,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
    "1978,    10,  30,     10.557,      0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
    "....\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "import datetime as dt\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "%pylab inline\n",
    "import pandas as pd\n",
    "pd.options.display.mpl_style = 'default'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "code to read the CSV files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\n",
    "def parse_the_date(year, mm, dd):\n",
    "    return dt.date(int(year), int(mm), int(dd))\n",
    "\n",
    "def slurp_csv(filename):\n",
    "    data = pd.read_csv(filename, header = None, skiprows=2,\n",
    "                       names=[\"year\", \"mm\", \"dd\", \"extent\", \"missing\", \"source\"],\n",
    "                       parse_dates={'date':['year', 'mm', 'dd']},\n",
    "                       date_parser=parse_the_date, index_col='date')\n",
    "    data = data.drop('missing', axis=1)\n",
    "    return data\n",
    "\n",
    "def read_a_hemisphere(hemisphere):\n",
    "    production_topdir = '/projects/DATASETS/NOAA/G02135/'\n",
    "    final_prod_filename = os.path.join(production_topdir, '{hemisphere}/daily/data/{hemi}H_seaice_extent_final.csv'.format(hemisphere=hemisphere, hemi=hemisphere[0:1].upper()))\n",
    "    nrt_prod_filename = os.path.join(production_topdir, '{hemisphere}/daily/data/{hemi}H_seaice_extent_nrt.csv'.format(hemisphere=hemisphere, hemi=hemisphere[0:1].upper()))\n",
    "\n",
    "    final = slurp_csv(final_prod_filename)\n",
    "    nrt = slurp_csv(nrt_prod_filename)\n",
    "    all_data = pd.concat([final, nrt])\n",
    "    return all_data\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "Read CSV data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "north = read_a_hemisphere('north')\n",
    "north.index = north.index.to_datetime()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<class 'pandas.tseries.index.DatetimeIndex'>\n",
       "[1978-10-26, ..., 2015-04-24]\n",
       "Length: 11700, Freq: None, Timezone: None"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "north = north.drop('source',axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "Now we just have Extent and a datetime index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>10.231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>10.420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>10.557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>10.670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.787</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent\n",
       "1978-10-26  10.231\n",
       "1978-10-28  10.420\n",
       "1978-10-30  10.557\n",
       "1978-11-01  10.670\n",
       "1978-11-03  10.787"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "We could simply do a rolling average here with a number of observations:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.533</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent\n",
       "1978-10-26     NaN\n",
       "1978-10-28     NaN\n",
       "1978-10-30     NaN\n",
       "1978-11-01     NaN\n",
       "1978-11-03  10.533"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.rolling_mean(north, window=5).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "And we can relax the number of data points needed to have a valid mean by changing min_periods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "      <th>5day averaged</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>10.231</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>10.420</td>\n",
       "      <td>10.325500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>10.557</td>\n",
       "      <td>10.402667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>10.670</td>\n",
       "      <td>10.469500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.787</td>\n",
       "      <td>10.533000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent  5day averaged\n",
       "1978-10-26  10.231            NaN\n",
       "1978-10-28  10.420      10.325500\n",
       "1978-10-30  10.557      10.402667\n",
       "1978-11-01  10.670      10.469500\n",
       "1978-11-03  10.787      10.533000"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north['5day averaged'] = pd.rolling_mean(north, window=5, min_periods=2)\n",
    "north.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "But what we see is that the window and period in the `rolling_mean` function,\n",
    "doesn't account for actual days, but rather just observations.\n",
    "\n",
    "so the 5day average for 1978-11-03 is incorrectly given as:\n",
    "\n",
    "     ( 10.231 + 10.420 + 10.557 + 10.670 + 10.787) / 5 = 10.533"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.533"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "( 10.231 + 10.420 + 10.557 +  10.670 +  10.787) / 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "We really should be just averaging 5 Days and ignoring the two days that the sensor was turned off:\n",
    "\n",
    "     1978-10-30  10.557\n",
    "     1978-10-31  missing\n",
    "     1978-11-01  10.670\n",
    "     1978-11-02  missing\n",
    "     1978-11-03  10.787\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.671333333333335"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(10.557 + 10.670 + 10.787) / 3\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "To fix this we need to set up a regularly spaced datetimeindex for the extent data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# get rid of the incorrectly averaged data\n",
    "north = north.drop('5day averaged', axis=1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "create a fake timeseries to align the missing data in the SMMR ERA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ts = pd.Series(NaN, index=pd.date_range('10/25/1978', '4/20/2015'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "---\n",
    "Now align the existing extents with the new regularly spaced data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-25</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>10.231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-27</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>10.420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-29</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>10.557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-31</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>10.670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-02</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.787</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent\n",
       "1978-10-25     NaN\n",
       "1978-10-26  10.231\n",
       "1978-10-27     NaN\n",
       "1978-10-28  10.420\n",
       "1978-10-29     NaN\n",
       "1978-10-30  10.557\n",
       "1978-10-31     NaN\n",
       "1978-11-01  10.670\n",
       "1978-11-02     NaN\n",
       "1978-11-03  10.787"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north = north.align(ts, axis=0, join='right')[0]\n",
    "north.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "you can see we've added regular timesteps with missing data to our northern timeseries.\n",
    "and we can recompute a correct 5 day average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "north['5day-Avg'] = pd.rolling_mean(north['extent'], window=5, min_periods=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "      <th>5day-Avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-25</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>10.231</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-27</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>10.420</td>\n",
       "      <td>10.325500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-29</th>\n",
       "      <td>NaN</td>\n",
       "      <td>10.325500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>10.557</td>\n",
       "      <td>10.402667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-31</th>\n",
       "      <td>NaN</td>\n",
       "      <td>10.488500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>10.670</td>\n",
       "      <td>10.549000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-02</th>\n",
       "      <td>NaN</td>\n",
       "      <td>10.613500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.787</td>\n",
       "      <td>10.671333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent   5day-Avg\n",
       "1978-10-25     NaN        NaN\n",
       "1978-10-26  10.231        NaN\n",
       "1978-10-27     NaN        NaN\n",
       "1978-10-28  10.420  10.325500\n",
       "1978-10-29     NaN  10.325500\n",
       "1978-10-30  10.557  10.402667\n",
       "1978-10-31     NaN  10.488500\n",
       "1978-11-01  10.670  10.549000\n",
       "1978-11-02     NaN  10.613500\n",
       "1978-11-03  10.787  10.671333"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "***\n",
    "and you can see now that our 5 day average for 1978-11-03 is correct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "avg(10.557, NaN, 10.670,  NaN,  10.787 ) => 10.671333"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.671333333333335"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean([10.557, 10.670,  10.787])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "if you don't like the missing days getting filled, in require another day in your min_period."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>extent</th>\n",
       "      <th>5day-Avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1978-10-25</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-26</th>\n",
       "      <td>10.231</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-27</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-28</th>\n",
       "      <td>10.420</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-29</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-30</th>\n",
       "      <td>10.557</td>\n",
       "      <td>10.402667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-10-31</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-01</th>\n",
       "      <td>10.670</td>\n",
       "      <td>10.549000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-02</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1978-11-03</th>\n",
       "      <td>10.787</td>\n",
       "      <td>10.671333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            extent   5day-Avg\n",
       "1978-10-25     NaN        NaN\n",
       "1978-10-26  10.231        NaN\n",
       "1978-10-27     NaN        NaN\n",
       "1978-10-28  10.420        NaN\n",
       "1978-10-29     NaN        NaN\n",
       "1978-10-30  10.557  10.402667\n",
       "1978-10-31     NaN        NaN\n",
       "1978-11-01  10.670  10.549000\n",
       "1978-11-02     NaN        NaN\n",
       "1978-11-03  10.787  10.671333"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "north['5day-Avg'] = pd.rolling_mean(north['extent'], window=5, min_periods=3)\n",
    "north.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  },
  "name": "Demonstrate_Rolling_Average.ipynb"
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"# Demonstrate rolling average for NSIDC timeseries data\n",
	"\n",
	"Given a timeseries with sometimes irregular or variable timeperiods between data points,\n",
	"compute an N day rolling mean, where M values must be observed to have a valid data point.\n",
	"\n",
	"N = number of days to average over\n",
	"M = number of valid data observations to get data\n",
	"\n",
	"The input data format is just a date and extent for each day we have data.\n",
	"```\n",
	"Year, Month, Day, Extent, Missing, Source Data\n",
	"YYYY, MM, DD, 10^6 sq km, 10^6 sq km, Source data product web site: http://nsidc.org/d....\n",
	"1978, 10, 26, 10.231, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
	"1978, 10, 28, 10.420, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
	"1978, 10, 30, 10.557, 0.000, ftp://sidads.colorado.edu/pub/DATASETS/nsidc0051....\n",
	"....\n",
	"```\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Populating the interactive namespace from numpy and matplotlib\n"
	]
	}
	],
	"source": [
	"import datetime as dt\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import os\n",
	"%pylab inline\n",
	"import pandas as pd\n",
	"pd.options.display.mpl_style = 'default'\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"code to read the CSV files."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"\n",
	"def parse_the_date(year, mm, dd):\n",
	" return dt.date(int(year), int(mm), int(dd))\n",
	"\n",
	"def slurp_csv(filename):\n",
	" data = pd.read_csv(filename, header = None, skiprows=2,\n",
	" names=[\"year\", \"mm\", \"dd\", \"extent\", \"missing\", \"source\"],\n",
	" parse_dates={'date':['year', 'mm', 'dd']},\n",
	" date_parser=parse_the_date, index_col='date')\n",
	" data = data.drop('missing', axis=1)\n",
	" return data\n",
	"\n",
	"def read_a_hemisphere(hemisphere):\n",
	" production_topdir = '/projects/DATASETS/NOAA/G02135/'\n",
	" final_prod_filename = os.path.join(production_topdir, '{hemisphere}/daily/data/{hemi}H_seaice_extent_final.csv'.format(hemisphere=hemisphere, hemi=hemisphere[0:1].upper()))\n",
	" nrt_prod_filename = os.path.join(production_topdir, '{hemisphere}/daily/data/{hemi}H_seaice_extent_nrt.csv'.format(hemisphere=hemisphere, hemi=hemisphere[0:1].upper()))\n",
	"\n",
	" final = slurp_csv(final_prod_filename)\n",
	" nrt = slurp_csv(nrt_prod_filename)\n",
	" all_data = pd.concat([final, nrt])\n",
	" return all_data\n",
	"\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"Read CSV data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"north = read_a_hemisphere('north')\n",
	"north.index = north.index.to_datetime()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<class 'pandas.tseries.index.DatetimeIndex'>\n",
	"[1978-10-26, ..., 2015-04-24]\n",
	"Length: 11700, Freq: None, Timezone: None"
	]
	},
	"execution_count": 38,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north.index"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"north = north.drop('source',axis=1)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"Now we just have Extent and a datetime index"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>10.231</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>10.420</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>10.557</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>10.670</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.787</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent\n",
	"1978-10-26 10.231\n",
	"1978-10-28 10.420\n",
	"1978-10-30 10.557\n",
	"1978-11-01 10.670\n",
	"1978-11-03 10.787"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"We could simply do a rolling average here with a number of observations:\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.533</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent\n",
	"1978-10-26 NaN\n",
	"1978-10-28 NaN\n",
	"1978-10-30 NaN\n",
	"1978-11-01 NaN\n",
	"1978-11-03 10.533"
	]
	},
	"execution_count": 41,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.rolling_mean(north, window=5).head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"And we can relax the number of data points needed to have a valid mean by changing min_periods"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" <th>5day averaged</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>10.231</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>10.420</td>\n",
	" <td>10.325500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>10.557</td>\n",
	" <td>10.402667</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>10.670</td>\n",
	" <td>10.469500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.787</td>\n",
	" <td>10.533000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent 5day averaged\n",
	"1978-10-26 10.231 NaN\n",
	"1978-10-28 10.420 10.325500\n",
	"1978-10-30 10.557 10.402667\n",
	"1978-11-01 10.670 10.469500\n",
	"1978-11-03 10.787 10.533000"
	]
	},
	"execution_count": 42,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north['5day averaged'] = pd.rolling_mean(north, window=5, min_periods=2)\n",
	"north.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"But what we see is that the window and period in the `rolling_mean` function,\n",
	"doesn't account for actual days, but rather just observations.\n",
	"\n",
	"so the 5day average for 1978-11-03 is incorrectly given as:\n",
	"\n",
	" ( 10.231 + 10.420 + 10.557 + 10.670 + 10.787) / 5 = 10.533"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10.533"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"( 10.231 + 10.420 + 10.557 + 10.670 + 10.787) / 5"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"We really should be just averaging 5 Days and ignoring the two days that the sensor was turned off:\n",
	"\n",
	" 1978-10-30 10.557\n",
	" 1978-10-31 missing\n",
	" 1978-11-01 10.670\n",
	" 1978-11-02 missing\n",
	" 1978-11-03 10.787\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10.671333333333335"
	]
	},
	"execution_count": 44,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"(10.557 + 10.670 + 10.787) / 3\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"To fix this we need to set up a regularly spaced datetimeindex for the extent data."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# get rid of the incorrectly averaged data\n",
	"north = north.drop('5day averaged', axis=1)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"create a fake timeseries to align the missing data in the SMMR ERA"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"ts = pd.Series(NaN, index=pd.date_range('10/25/1978', '4/20/2015'))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"---\n",
	"Now align the existing extents with the new regularly spaced data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-25</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>10.231</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-27</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>10.420</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-29</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>10.557</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-31</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>10.670</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-02</th>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.787</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent\n",
	"1978-10-25 NaN\n",
	"1978-10-26 10.231\n",
	"1978-10-27 NaN\n",
	"1978-10-28 10.420\n",
	"1978-10-29 NaN\n",
	"1978-10-30 10.557\n",
	"1978-10-31 NaN\n",
	"1978-11-01 10.670\n",
	"1978-11-02 NaN\n",
	"1978-11-03 10.787"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north = north.align(ts, axis=0, join='right')[0]\n",
	"north.head(10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"you can see we've added regular timesteps with missing data to our northern timeseries.\n",
	"and we can recompute a correct 5 day average"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"north['5day-Avg'] = pd.rolling_mean(north['extent'], window=5, min_periods=2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" <th>5day-Avg</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-25</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>10.231</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-27</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>10.420</td>\n",
	" <td>10.325500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-29</th>\n",
	" <td>NaN</td>\n",
	" <td>10.325500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>10.557</td>\n",
	" <td>10.402667</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-31</th>\n",
	" <td>NaN</td>\n",
	" <td>10.488500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>10.670</td>\n",
	" <td>10.549000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-02</th>\n",
	" <td>NaN</td>\n",
	" <td>10.613500</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.787</td>\n",
	" <td>10.671333</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent 5day-Avg\n",
	"1978-10-25 NaN NaN\n",
	"1978-10-26 10.231 NaN\n",
	"1978-10-27 NaN NaN\n",
	"1978-10-28 10.420 10.325500\n",
	"1978-10-29 NaN 10.325500\n",
	"1978-10-30 10.557 10.402667\n",
	"1978-10-31 NaN 10.488500\n",
	"1978-11-01 10.670 10.549000\n",
	"1978-11-02 NaN 10.613500\n",
	"1978-11-03 10.787 10.671333"
	]
	},
	"execution_count": 49,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north.head(10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"***\n",
	"and you can see now that our 5 day average for 1978-11-03 is correct"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"avg(10.557, NaN, 10.670, NaN, 10.787 ) => 10.671333"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10.671333333333335"
	]
	},
	"execution_count": 50,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.mean([10.557, 10.670, 10.787])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"if you don't like the missing days getting filled, in require another day in your min_period."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>extent</th>\n",
	" <th>5day-Avg</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1978-10-25</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-26</th>\n",
	" <td>10.231</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-27</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-28</th>\n",
	" <td>10.420</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-29</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-30</th>\n",
	" <td>10.557</td>\n",
	" <td>10.402667</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-10-31</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-01</th>\n",
	" <td>10.670</td>\n",
	" <td>10.549000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-02</th>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1978-11-03</th>\n",
	" <td>10.787</td>\n",
	" <td>10.671333</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" extent 5day-Avg\n",
	"1978-10-25 NaN NaN\n",
	"1978-10-26 10.231 NaN\n",
	"1978-10-27 NaN NaN\n",
	"1978-10-28 10.420 NaN\n",
	"1978-10-29 NaN NaN\n",
	"1978-10-30 10.557 10.402667\n",
	"1978-10-31 NaN NaN\n",
	"1978-11-01 10.670 10.549000\n",
	"1978-11-02 NaN NaN\n",
	"1978-11-03 10.787 10.671333"
	]
	},
	"execution_count": 51,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"north['5day-Avg'] = pd.rolling_mean(north['extent'], window=5, min_periods=3)\n",
	"north.head(10)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	},
	"name": "Demonstrate_Rolling_Average.ipynb"
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}