phobson · December 19, 2015 19:28
diff --git a/selections.ipynb b/selections.ipynb
 {
 "metadata": {
  "name": "Selection Tutorial"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Complex data selections with multiple valid values using Pandas\n",
      "## \n",
      "### Main concepts covered:\n",
      "+ How to use the cross-section (`.xs`) method of a dataframe to select data where *N* seperate levels of the index are equal to *N* seperate values\n",
      "+ How to use the `.select` method of a dataframe to select data where *N* levels of the index meet arbitrary criteria\n",
      "\n",
      "### First, let's load some fake data, and set an index\n",
      "+ Read the data using the top-level `read_csv` function.\n",
      "+ Define the index like we would a primary key in a database. \n",
      "+ Also pass in a string representation of the values that indicated missing/NA data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import StringIO\n",
      "import pandas\n",
      "from IPython.display import HTML\n",
      "\n",
      "data_string = StringIO.StringIO(\"\"\"location,storm,station,parameter,result,qual\n",
      "Airport,1,Inflow,Lead,4.1,J\n",
      "Airport,1,Inflow,TSS,2.2,=\n",
      "Airport,1,Outflow,Lead,1.3,=\n",
      "Airport,1,Outflow,TSS,9.4,=\n",
      "Airport,2,Inflow,Lead,2.5,=\n",
      "Airport,2,Inflow,TSS,6.6,=\n",
      "Airport,2,Outflow,Lead,1.7,=\n",
      "Airport,2,Outflow,TSS,2.8,=\n",
      "Airport,3,Inflow,Lead,1.9,J\n",
      "Airport,3,Inflow,TSS,8.0,=\n",
      "Airport,3,Outflow,Lead,1.1,=\n",
      "Airport,3,Outflow,TSS,8.2,=\n",
      "Airport,4,Inflow,Lead,-999,-999\n",
      "Airport,4,Inflow,TSS,10.4,=\n",
      "Airport,4,Outflow,Lead,1.5,=\n",
      "Airport,4,Outflow,TSS,6.6,=\n",
      "Airport,5,Inflow,Lead,-999,-999\n",
      "Airport,5,Inflow,TSS,7.8,J\n",
      "Airport,5,Outflow,Lead,1.9,J\n",
      "Airport,5,Outflow,TSS,2.0,=\n",
      "School,1,Inflow,Lead,1.1,=\n",
      "School,1,Inflow,TSS,10.2,=\n",
      "School,1,Outflow,Lead,1.3,=\n",
      "School,1,Outflow,TSS,10.4,=\n",
      "School,2,Inflow,Lead,1.5,J\n",
      "School,2,Inflow,TSS,2.6,=\n",
      "School,2,Outflow,Lead,2.7,=\n",
      "School,2,Outflow,TSS,9.8,=\n",
      "School,3,Inflow,Lead,1.9,=\n",
      "School,3,Inflow,TSS,-999,=\n",
      "School,3,Outflow,Lead,2.1,=\n",
      "School,3,Outflow,TSS,4.2,=\n",
      "Business,1,Inflow,Lead,-999,-999\n",
      "Business,1,Inflow,TSS,7.4,=\n",
      "Business,1,Outflow,Lead,1.5,=\n",
      "Business,1,Outflow,TSS,9.6,=\n",
      "Business,2,Inflow,Lead,4.7,J\n",
      "Business,2,Inflow,TSS,8.8,=\n",
      "Business,2,Outflow,Lead,3.9,J\n",
      "Business,2,Outflow,TSS,3.0,=\n",
      "Business,3,Inflow,Lead,1.1,J\n",
      "Business,3,Inflow,TSS,5.2,=\n",
      "Business,3,Outflow,Lead,2.3,J\n",
      "Business,3,Outflow,TSS,5.4,=\n",
      "\"\"\")\n",
      "raw_data = pandas.read_csv(data_string, index_col=['location', 'storm', 'station', 'parameter'], na_values='-999')\n",
      "HTML(raw_data.to_html())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "      <th>qual</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th>storm</th>\n",
        "      <th>station</th>\n",
        "      <th>parameter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th rowspan=\"20\" valign=\"top\">Airport</th>\n",
        "      <th rowspan=\"4\" valign=\"top\">1</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  4.1</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  2.2</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.3</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  9.4</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">2</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  2.5</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  6.6</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.7</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  2.8</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">3</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.9</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  8.0</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.1</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  8.2</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">4</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  NaN</td>\n",
        "      <td> NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td> 10.4</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.5</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  6.6</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">5</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  NaN</td>\n",
        "      <td> NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  7.8</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.9</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  2.0</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"12\" valign=\"top\">School</th>\n",
        "      <th rowspan=\"4\" valign=\"top\">1</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.1</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td> 10.2</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.3</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td> 10.4</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">2</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.5</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  2.6</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  2.7</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  9.8</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">3</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.9</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  NaN</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  2.1</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  4.2</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"12\" valign=\"top\">Business</th>\n",
        "      <th rowspan=\"4\" valign=\"top\">1</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  NaN</td>\n",
        "      <td> NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  7.4</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.5</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  9.6</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">2</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  4.7</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  8.8</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  3.9</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  3.0</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"4\" valign=\"top\">3</th>\n",
        "      <th rowspan=\"2\" valign=\"top\">Inflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.1</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  5.2</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  2.3</td>\n",
        "      <td>   J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>TSS</th>\n",
        "      <td>  5.4</td>\n",
        "      <td>   =</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>"
       ],
       "output_type": "pyout",
       "prompt_number": 1,
       "text": [
        "<IPython.core.display.HTML at 0x55808d0>"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Simple Demos\n",
      "### Cross-sections (`.xs`)\n",
      "The cross section method can apply single-equality criteria to any number of levels of an index at the same time (*and* criteria, but not *or*). \n",
      "\n",
      "These are very efficient and optimized in pandas and act as vectorized operation on the index\n",
      "\n",
      "It takes the form: \n",
      "\n",
      "`subset = dataframe.xs([val1, val2, ... , valN], level=[col1, col2, ..., colN])`\n",
      "where `col1` through `colN` are valid index levels\n",
      "\n",
      "One thing to notice about taking cross sections is that the index levels against which you apply the criteria are dropped (since they are now a single, known value). This is not the case with selections using `.select()` (more on that later).\n",
      "\n",
      "### So for this, let's select all of the TSS data at the Business\n",
      "\n",
      "If we were speaking SQL, we'd say:\n",
      "```\n",
      "select * \n",
      "from raw_data\n",
      "where location = 'Business'\n",
      "  and parameter = 'TSS'\n",
      "```\n",
      "\n",
      "Here it is in pandas:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "business_tss = raw_data.xs(['Business', 'TSS'], level=['location', 'parameter'])\n",
      "business_tss"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "      <th>qual</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>storm</th>\n",
        "      <th>station</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">1</th>\n",
        "      <th>Inflow</th>\n",
        "      <td> 7.4</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Outflow</th>\n",
        "      <td> 9.6</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">2</th>\n",
        "      <th>Inflow</th>\n",
        "      <td> 8.8</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Outflow</th>\n",
        "      <td> 3.0</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">3</th>\n",
        "      <th>Inflow</th>\n",
        "      <td> 5.2</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Outflow</th>\n",
        "      <td> 5.4</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "               result qual\n",
        "storm station             \n",
        "1     Inflow      7.4    =\n",
        "      Outflow     9.6    =\n",
        "2     Inflow      8.8    =\n",
        "      Outflow     3.0    =\n",
        "3     Inflow      5.2    =\n",
        "      Outflow     5.4    ="
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Selection (`.select`)\n",
      "Selections allow us to more arbitrarily apply criteria to the index. This is less effiecient because it has to loop through each row of the index and apply a user-defined seletion function to each row.\n",
      "\n",
      "To use a selection you have to define a function that will accept a row from the index, inspect it, and decide to either return `True` if you want the data or `False` if you don't.\n",
      "\n",
      "So let's pull out effluent lead data from either buisness or the airport when the storm was number 3 or higher and influent TSS data from the school. Again, in SQL we'd write:\n",
      "```\n",
      "select *\n",
      "from raw_data\n",
      "where (\n",
      "    location in ('aiport', 'business')\n",
      "and storm >= 3\n",
      "and station = 'Outflow'\n",
      "and parameter = 'Lead'\n",
      ") or (\n",
      "    location = 'school'\n",
      "and station = 'Outflow'\n",
      "and parameter = 'TSS'\n",
      ")\n",
      "```\n",
      "\n",
      "And in pandas..."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def criteria(row):\n",
      "    if (row[0] in ['Airport', 'Business']) & (row[1] >= 3) & (row[2] == 'Outflow') & (row[3] == 'Lead'):\n",
      "        return True\n",
      "    elif (row[0] == 'School') & (row[2] == 'Inflow') & (row[3] == 'TSS'):\n",
      "        return True\n",
      "    else:\n",
      "        return False\n",
      "    \n",
      "subset = raw_data.select(criteria)\n",
      "subset "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "      <th>qual</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th>storm</th>\n",
        "      <th>station</th>\n",
        "      <th>parameter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">Airport</th>\n",
        "      <th>3</th>\n",
        "      <th>Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.1</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <th>Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.5</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <th>Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  1.9</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">School</th>\n",
        "      <th>1</th>\n",
        "      <th>Inflow</th>\n",
        "      <th>TSS</th>\n",
        "      <td> 10.2</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <th>Inflow</th>\n",
        "      <th>TSS</th>\n",
        "      <td>  2.6</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <th>Inflow</th>\n",
        "      <th>TSS</th>\n",
        "      <td>  NaN</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Business</th>\n",
        "      <th>3</th>\n",
        "      <th>Outflow</th>\n",
        "      <th>Lead</th>\n",
        "      <td>  2.3</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "                                  result qual\n",
        "location storm station parameter             \n",
        "Airport  3     Outflow Lead          1.1    =\n",
        "         4     Outflow Lead          1.5    =\n",
        "         5     Outflow Lead          1.9    J\n",
        "School   1     Inflow  TSS          10.2    =\n",
        "         2     Inflow  TSS           2.6    =\n",
        "         3     Inflow  TSS           NaN    =\n",
        "Business 3     Outflow Lead          2.3    J"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Advanced Demo\n",
      "### Premise:\n",
      "#### We have:\n",
      "Influent and effluent TSS and lead concentrations measured at 3 different location over the course of multiple storms.\n",
      "#### We want:\n",
      "Median influent lead concentration for each location with more than 3 valid datapoints\n",
      "#### Approach\n",
      "+ We *cross-section* the rows using the `.xs` method of the dataframe for the parameter and monitoring locations\n",
      "+ We'll also drop the missing values b/c we don't want them with `.dropna`\n",
      "+ Aggregate the data by the `location` column to get the number of results\n",
      "+ Subset the data further by selecting the rows where the data aggregated on `location` has more than three results"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# select out just the influent lead data\n",
      "lead = raw_data.xs(['Lead', 'Inflow'], level=['parameter', 'station'], axis=0)\n",
      "\n",
      "# remove rows missing data\n",
      "lead = lead.dropna()\n",
      "\n",
      "lead"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "      <th>qual</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th>storm</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">Airport</th>\n",
        "      <th>1</th>\n",
        "      <td> 4.1</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 2.5</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1.9</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">School</th>\n",
        "      <th>1</th>\n",
        "      <td> 1.1</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1.5</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1.9</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"2\" valign=\"top\">Business</th>\n",
        "      <th>2</th>\n",
        "      <td> 4.7</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1.1</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "                result qual\n",
        "location storm             \n",
        "Airport  1         4.1    J\n",
        "         2         2.5    =\n",
        "         3         1.9    J\n",
        "School   1         1.1    =\n",
        "         2         1.5    J\n",
        "         3         1.9    =\n",
        "Business 2         4.7    J\n",
        "         3         1.1    J"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Now we need to count how many valid results we have at each `location`\n",
      "### Just like in a database, we use a `groupby` statement"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "all_lead_studies = lead.groupby(level='location').size()\n",
      "\n",
      "all_lead_studies"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "location\n",
        "Airport     3\n",
        "Business    2\n",
        "School      3\n",
        "dtype: int64"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Then we can select all the BMPs with more than 3 results"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "good_lead_studies = all_lead_studies[all_lead_studies >= 3]\n",
      "\n",
      "good_lead_studies"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "location\n",
        "Airport     3\n",
        "School      3\n",
        "dtype: int64"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Here is where the really fancy stuff happens. \n",
      "With the `.xs` method, we can really efficiently (computer time) select out single values from an index. But what if we need more?\n",
      "That's where the `.select` method comes in. `.select` takes in a function that looks at each row of the index, and based on the values at that row, returns either `True` or `False`. Where that function returns `True`, we get data back. Where that function function returns `False`, data are omitted.\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def is_bmp_valid(index_row):\n",
      "    if index_row[0] in good_lead_studies:\n",
      "        return True\n",
      "    else:\n",
      "        return False\n",
      "\n",
      "lead = lead.select(is_bmp_valid)\n",
      "\n",
      "## if you know what lambdas are, you can also do:\n",
      "# lead = lead.select(lambda x: x[0] in good_lead_studies)\n",
      "lead"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "      <th>qual</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th>storm</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">Airport</th>\n",
        "      <th>1</th>\n",
        "      <td> 4.1</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 2.5</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1.9</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">School</th>\n",
        "      <th>1</th>\n",
        "      <td> 1.1</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 1.5</td>\n",
        "      <td> J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 1.9</td>\n",
        "      <td> =</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "                result qual\n",
        "location storm             \n",
        "Airport  1         4.1    J\n",
        "         2         2.5    =\n",
        "         3         1.9    J\n",
        "School   1         1.1    =\n",
        "         2         1.5    J\n",
        "         3         1.9    ="
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Now we can compute the median of the whole dataset *en masse*"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lead.median()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "result    1.9\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Or look at each `location` individually"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lead.groupby(level='location').median()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>result</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>location</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Airport</th>\n",
        "      <td> 2.5</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>School</th>\n",
        "      <td> 1.5</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "          result\n",
        "location        \n",
        "Airport      2.5\n",
        "School       1.5"
       ]
      }
     ],
     "prompt_number": 9
    }
   ],
   "metadata": {}
  }
 ]
 }