Last active
December 19, 2015 19:28
-
-
Save phobson/6006004 to your computer and use it in GitHub Desktop.
Brief tutorial on selecting data in with pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Selection Tutorial" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Complex data selections with multiple valid values using Pandas\n", | |
"## \n", | |
"### Main concepts covered:\n", | |
"+ How to use the cross-section (`.xs`) method of a dataframe to select data where *N* seperate levels of the index are equal to *N* seperate values\n", | |
"+ How to use the `.select` method of a dataframe to select data where *N* levels of the index meet arbitrary criteria\n", | |
"\n", | |
"### First, let's load some fake data, and set an index\n", | |
"+ Read the data using the top-level `read_csv` function.\n", | |
"+ Define the index like we would a primary key in a database. \n", | |
"+ Also pass in a string representation of the values that indicated missing/NA data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import StringIO\n", | |
"import pandas\n", | |
"from IPython.display import HTML\n", | |
"\n", | |
"data_string = StringIO.StringIO(\"\"\"location,storm,station,parameter,result,qual\n", | |
"Airport,1,Inflow,Lead,4.1,J\n", | |
"Airport,1,Inflow,TSS,2.2,=\n", | |
"Airport,1,Outflow,Lead,1.3,=\n", | |
"Airport,1,Outflow,TSS,9.4,=\n", | |
"Airport,2,Inflow,Lead,2.5,=\n", | |
"Airport,2,Inflow,TSS,6.6,=\n", | |
"Airport,2,Outflow,Lead,1.7,=\n", | |
"Airport,2,Outflow,TSS,2.8,=\n", | |
"Airport,3,Inflow,Lead,1.9,J\n", | |
"Airport,3,Inflow,TSS,8.0,=\n", | |
"Airport,3,Outflow,Lead,1.1,=\n", | |
"Airport,3,Outflow,TSS,8.2,=\n", | |
"Airport,4,Inflow,Lead,-999,-999\n", | |
"Airport,4,Inflow,TSS,10.4,=\n", | |
"Airport,4,Outflow,Lead,1.5,=\n", | |
"Airport,4,Outflow,TSS,6.6,=\n", | |
"Airport,5,Inflow,Lead,-999,-999\n", | |
"Airport,5,Inflow,TSS,7.8,J\n", | |
"Airport,5,Outflow,Lead,1.9,J\n", | |
"Airport,5,Outflow,TSS,2.0,=\n", | |
"School,1,Inflow,Lead,1.1,=\n", | |
"School,1,Inflow,TSS,10.2,=\n", | |
"School,1,Outflow,Lead,1.3,=\n", | |
"School,1,Outflow,TSS,10.4,=\n", | |
"School,2,Inflow,Lead,1.5,J\n", | |
"School,2,Inflow,TSS,2.6,=\n", | |
"School,2,Outflow,Lead,2.7,=\n", | |
"School,2,Outflow,TSS,9.8,=\n", | |
"School,3,Inflow,Lead,1.9,=\n", | |
"School,3,Inflow,TSS,-999,=\n", | |
"School,3,Outflow,Lead,2.1,=\n", | |
"School,3,Outflow,TSS,4.2,=\n", | |
"Business,1,Inflow,Lead,-999,-999\n", | |
"Business,1,Inflow,TSS,7.4,=\n", | |
"Business,1,Outflow,Lead,1.5,=\n", | |
"Business,1,Outflow,TSS,9.6,=\n", | |
"Business,2,Inflow,Lead,4.7,J\n", | |
"Business,2,Inflow,TSS,8.8,=\n", | |
"Business,2,Outflow,Lead,3.9,J\n", | |
"Business,2,Outflow,TSS,3.0,=\n", | |
"Business,3,Inflow,Lead,1.1,J\n", | |
"Business,3,Inflow,TSS,5.2,=\n", | |
"Business,3,Outflow,Lead,2.3,J\n", | |
"Business,3,Outflow,TSS,5.4,=\n", | |
"\"\"\")\n", | |
"raw_data = pandas.read_csv(data_string, index_col=['location', 'storm', 'station', 'parameter'], na_values='-999')\n", | |
"HTML(raw_data.to_html())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" <th>qual</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>location</th>\n", | |
" <th>storm</th>\n", | |
" <th>station</th>\n", | |
" <th>parameter</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"20\" valign=\"top\">Airport</th>\n", | |
" <th rowspan=\"4\" valign=\"top\">1</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 4.1</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 2.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.3</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 9.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">2</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 2.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 6.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.7</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 2.8</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">3</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 8.0</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 8.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">4</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> NaN</td>\n", | |
" <td> NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 10.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 6.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">5</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> NaN</td>\n", | |
" <td> NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 7.8</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 2.0</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"12\" valign=\"top\">School</th>\n", | |
" <th rowspan=\"4\" valign=\"top\">1</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 10.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.3</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 10.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">2</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 2.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 2.7</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 9.8</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">3</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> NaN</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 2.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 4.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"12\" valign=\"top\">Business</th>\n", | |
" <th rowspan=\"4\" valign=\"top\">1</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> NaN</td>\n", | |
" <td> NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 7.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 9.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">2</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 4.7</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 8.8</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 3.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 3.0</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">3</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">Inflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 5.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 2.3</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>TSS</th>\n", | |
" <td> 5.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 1, | |
"text": [ | |
"<IPython.core.display.HTML at 0x55808d0>" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Simple Demos\n", | |
"### Cross-sections (`.xs`)\n", | |
"The cross section method can apply single-equality criteria to any number of levels of an index at the same time (*and* criteria, but not *or*). \n", | |
"\n", | |
"These are very efficient and optimized in pandas and act as vectorized operation on the index\n", | |
"\n", | |
"It takes the form: \n", | |
"\n", | |
"`subset = dataframe.xs([val1, val2, ... , valN], level=[col1, col2, ..., colN])`\n", | |
"where `col1` through `colN` are valid index levels\n", | |
"\n", | |
"One thing to notice about taking cross sections is that the index levels against which you apply the criteria are dropped (since they are now a single, known value). This is not the case with selections using `.select()` (more on that later).\n", | |
"\n", | |
"### So for this, let's select all of the TSS data at the Business\n", | |
"\n", | |
"If we were speaking SQL, we'd say:\n", | |
"```\n", | |
"select * \n", | |
"from raw_data\n", | |
"where location = 'Business'\n", | |
" and parameter = 'TSS'\n", | |
"```\n", | |
"\n", | |
"Here it is in pandas:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"business_tss = raw_data.xs(['Business', 'TSS'], level=['location', 'parameter'])\n", | |
"business_tss" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" <th>qual</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>storm</th>\n", | |
" <th>station</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">1</th>\n", | |
" <th>Inflow</th>\n", | |
" <td> 7.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Outflow</th>\n", | |
" <td> 9.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">2</th>\n", | |
" <th>Inflow</th>\n", | |
" <td> 8.8</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Outflow</th>\n", | |
" <td> 3.0</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">3</th>\n", | |
" <th>Inflow</th>\n", | |
" <td> 5.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Outflow</th>\n", | |
" <td> 5.4</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
" result qual\n", | |
"storm station \n", | |
"1 Inflow 7.4 =\n", | |
" Outflow 9.6 =\n", | |
"2 Inflow 8.8 =\n", | |
" Outflow 3.0 =\n", | |
"3 Inflow 5.2 =\n", | |
" Outflow 5.4 =" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Selection (`.select`)\n", | |
"Selections allow us to more arbitrarily apply criteria to the index. This is less effiecient because it has to loop through each row of the index and apply a user-defined seletion function to each row.\n", | |
"\n", | |
"To use a selection you have to define a function that will accept a row from the index, inspect it, and decide to either return `True` if you want the data or `False` if you don't.\n", | |
"\n", | |
"So let's pull out effluent lead data from either buisness or the airport when the storm was number 3 or higher and influent TSS data from the school. Again, in SQL we'd write:\n", | |
"```\n", | |
"select *\n", | |
"from raw_data\n", | |
"where (\n", | |
" location in ('aiport', 'business')\n", | |
"and storm >= 3\n", | |
"and station = 'Outflow'\n", | |
"and parameter = 'Lead'\n", | |
") or (\n", | |
" location = 'school'\n", | |
"and station = 'Outflow'\n", | |
"and parameter = 'TSS'\n", | |
")\n", | |
"```\n", | |
"\n", | |
"And in pandas..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def criteria(row):\n", | |
" if (row[0] in ['Airport', 'Business']) & (row[1] >= 3) & (row[2] == 'Outflow') & (row[3] == 'Lead'):\n", | |
" return True\n", | |
" elif (row[0] == 'School') & (row[2] == 'Inflow') & (row[3] == 'TSS'):\n", | |
" return True\n", | |
" else:\n", | |
" return False\n", | |
" \n", | |
"subset = raw_data.select(criteria)\n", | |
"subset " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" <th>qual</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>location</th>\n", | |
" <th>storm</th>\n", | |
" <th>station</th>\n", | |
" <th>parameter</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">Airport</th>\n", | |
" <th>3</th>\n", | |
" <th>Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <th>Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <th>Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">School</th>\n", | |
" <th>1</th>\n", | |
" <th>Inflow</th>\n", | |
" <th>TSS</th>\n", | |
" <td> 10.2</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <th>Inflow</th>\n", | |
" <th>TSS</th>\n", | |
" <td> 2.6</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <th>Inflow</th>\n", | |
" <th>TSS</th>\n", | |
" <td> NaN</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Business</th>\n", | |
" <th>3</th>\n", | |
" <th>Outflow</th>\n", | |
" <th>Lead</th>\n", | |
" <td> 2.3</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
" result qual\n", | |
"location storm station parameter \n", | |
"Airport 3 Outflow Lead 1.1 =\n", | |
" 4 Outflow Lead 1.5 =\n", | |
" 5 Outflow Lead 1.9 J\n", | |
"School 1 Inflow TSS 10.2 =\n", | |
" 2 Inflow TSS 2.6 =\n", | |
" 3 Inflow TSS NaN =\n", | |
"Business 3 Outflow Lead 2.3 J" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Advanced Demo\n", | |
"### Premise:\n", | |
"#### We have:\n", | |
"Influent and effluent TSS and lead concentrations measured at 3 different location over the course of multiple storms.\n", | |
"#### We want:\n", | |
"Median influent lead concentration for each location with more than 3 valid datapoints\n", | |
"#### Approach\n", | |
"+ We *cross-section* the rows using the `.xs` method of the dataframe for the parameter and monitoring locations\n", | |
"+ We'll also drop the missing values b/c we don't want them with `.dropna`\n", | |
"+ Aggregate the data by the `location` column to get the number of results\n", | |
"+ Subset the data further by selecting the rows where the data aggregated on `location` has more than three results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# select out just the influent lead data\n", | |
"lead = raw_data.xs(['Lead', 'Inflow'], level=['parameter', 'station'], axis=0)\n", | |
"\n", | |
"# remove rows missing data\n", | |
"lead = lead.dropna()\n", | |
"\n", | |
"lead" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" <th>qual</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>location</th>\n", | |
" <th>storm</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">Airport</th>\n", | |
" <th>1</th>\n", | |
" <td> 4.1</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 2.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">School</th>\n", | |
" <th>1</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Business</th>\n", | |
" <th>2</th>\n", | |
" <td> 4.7</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
" result qual\n", | |
"location storm \n", | |
"Airport 1 4.1 J\n", | |
" 2 2.5 =\n", | |
" 3 1.9 J\n", | |
"School 1 1.1 =\n", | |
" 2 1.5 J\n", | |
" 3 1.9 =\n", | |
"Business 2 4.7 J\n", | |
" 3 1.1 J" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Now we need to count how many valid results we have at each `location`\n", | |
"### Just like in a database, we use a `groupby` statement" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"all_lead_studies = lead.groupby(level='location').size()\n", | |
"\n", | |
"all_lead_studies" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"location\n", | |
"Airport 3\n", | |
"Business 2\n", | |
"School 3\n", | |
"dtype: int64" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Then we can select all the BMPs with more than 3 results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"good_lead_studies = all_lead_studies[all_lead_studies >= 3]\n", | |
"\n", | |
"good_lead_studies" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"location\n", | |
"Airport 3\n", | |
"School 3\n", | |
"dtype: int64" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Here is where the really fancy stuff happens. \n", | |
"With the `.xs` method, we can really efficiently (computer time) select out single values from an index. But what if we need more?\n", | |
"That's where the `.select` method comes in. `.select` takes in a function that looks at each row of the index, and based on the values at that row, returns either `True` or `False`. Where that function returns `True`, we get data back. Where that function function returns `False`, data are omitted.\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def is_bmp_valid(index_row):\n", | |
" if index_row[0] in good_lead_studies:\n", | |
" return True\n", | |
" else:\n", | |
" return False\n", | |
"\n", | |
"lead = lead.select(is_bmp_valid)\n", | |
"\n", | |
"## if you know what lambdas are, you can also do:\n", | |
"# lead = lead.select(lambda x: x[0] in good_lead_studies)\n", | |
"lead" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" <th>qual</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>location</th>\n", | |
" <th>storm</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">Airport</th>\n", | |
" <th>1</th>\n", | |
" <td> 4.1</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 2.5</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">School</th>\n", | |
" <th>1</th>\n", | |
" <td> 1.1</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1.5</td>\n", | |
" <td> J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1.9</td>\n", | |
" <td> =</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
" result qual\n", | |
"location storm \n", | |
"Airport 1 4.1 J\n", | |
" 2 2.5 =\n", | |
" 3 1.9 J\n", | |
"School 1 1.1 =\n", | |
" 2 1.5 J\n", | |
" 3 1.9 =" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Now we can compute the median of the whole dataset *en masse*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"lead.median()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"result 1.9\n", | |
"dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Or look at each `location` individually" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"lead.groupby(level='location').median()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>result</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>location</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Airport</th>\n", | |
" <td> 2.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>School</th>\n", | |
" <td> 1.5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
" result\n", | |
"location \n", | |
"Airport 2.5\n", | |
"School 1.5" | |
] | |
} | |
], | |
"prompt_number": 9 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment