Last active
December 15, 2015 19:49
-
-
Save phobson/5314570 to your computer and use it in GitHub Desktop.
Units standardization using pandas. View it here: http://nbviewer.ipython.org/5314570
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "units conversion" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Import and units look-up dictionary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas\n", | |
"import StringIO\n", | |
"units_map = {\n", | |
" 'ug/L': 1e-6,\n", | |
" 'mg/L': 1e-3,\n", | |
" 'g/L' : 1e+0,\n", | |
"}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Mock up some data, define the target units" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data_csv = StringIO.StringIO(\"\"\"\\\n", | |
"storm,param,station,units,conc\n", | |
"1,lead,inflow,ug/L,10\n", | |
"2,lead,inflow,mg/L,0.02\n", | |
"3,lead,inflow,g/L,0.00003\n", | |
"4,lead,inflow,ug/L,40\n", | |
"1,lead,outflow,mg/L,0.05\n", | |
"2,lead,outflow,ug/L,60\n", | |
"3,lead,outflow,ug/L,70\n", | |
"4,lead,outflow,g/L,0.00008\"\"\")\n", | |
"data = pandas.read_csv(data_csv)\n", | |
"\n", | |
"target_units = 'ug/L'\n", | |
"data" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>storm</th>\n", | |
" <th>param</th>\n", | |
" <th>station</th>\n", | |
" <th>units</th>\n", | |
" <th>conc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 10.00000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> mg/L</td>\n", | |
" <td> 0.02000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> g/L</td>\n", | |
" <td> 0.00003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 4</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 40.00000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> mg/L</td>\n", | |
" <td> 0.05000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 2</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 60.00000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td> 3</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 70.00000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td> 4</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> g/L</td>\n", | |
" <td> 0.00008</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
" storm param station units conc\n", | |
"0 1 lead inflow ug/L 10.00000\n", | |
"1 2 lead inflow mg/L 0.02000\n", | |
"2 3 lead inflow g/L 0.00003\n", | |
"3 4 lead inflow ug/L 40.00000\n", | |
"4 1 lead outflow mg/L 0.05000\n", | |
"5 2 lead outflow ug/L 60.00000\n", | |
"6 3 lead outflow ug/L 70.00000\n", | |
"7 4 lead outflow g/L 0.00008" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Compute factors needed to normalize the units\n", | |
"`get` is a function/method of dictaries that returns the value based on the key. Here we're mapping that function to all of the records in `data['unit']`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"normalziation = data['units'].map(units_map.get) \n", | |
"normalziation" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"0 0.000001\n", | |
"1 0.001000\n", | |
"2 1.000000\n", | |
"3 0.000001\n", | |
"4 0.001000\n", | |
"5 0.000001\n", | |
"6 0.000001\n", | |
"7 1.000000\n", | |
"Name: units" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Normalize and convert to ug/L in one operation and update the `units` column too." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data['conc'] = data['conc'] * normalziation/units_map[target_units]\n", | |
"data['units'] = target_units\n", | |
"data" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>storm</th>\n", | |
" <th>param</th>\n", | |
" <th>station</th>\n", | |
" <th>units</th>\n", | |
" <th>conc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 20</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 30</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 4</td>\n", | |
" <td> lead</td>\n", | |
" <td> inflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 2</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td> 3</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 70</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td> 4</td>\n", | |
" <td> lead</td>\n", | |
" <td> outflow</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 80</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
" storm param station units conc\n", | |
"0 1 lead inflow ug/L 10\n", | |
"1 2 lead inflow ug/L 20\n", | |
"2 3 lead inflow ug/L 30\n", | |
"3 4 lead inflow ug/L 40\n", | |
"4 1 lead outflow ug/L 50\n", | |
"5 2 lead outflow ug/L 60\n", | |
"6 3 lead outflow ug/L 70\n", | |
"7 4 lead outflow ug/L 80" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Now unstack, rename, and organize the table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create an index\n", | |
"data.set_index(['storm', 'param', 'station'], inplace=True)\n", | |
"\n", | |
"# unstack/cross tab based on inflow/outflow\n", | |
"table = data.unstack(level='station')\n", | |
"\n", | |
"# rename the column level labels\n", | |
"table.columns.names = ['Quantity', 'Monitoring Station']\n", | |
"\n", | |
"# reorder the column levels\n", | |
"table.columns = table.columns.swaplevel(0,1)\n", | |
"\n", | |
"# order the actual columns\n", | |
"table = table[[('inflow', 'conc'), ('inflow', 'units'), ('outflow', 'conc'), ('outflow', 'units')]]\n", | |
"table" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>Monitoring Station</th>\n", | |
" <th colspan=\"2\" halign=\"left\">inflow</th>\n", | |
" <th colspan=\"2\" halign=\"left\">outflow</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>Quantity</th>\n", | |
" <th>conc</th>\n", | |
" <th>units</th>\n", | |
" <th>conc</th>\n", | |
" <th>units</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>storm</th>\n", | |
" <th>param</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <th>lead</th>\n", | |
" <td> 10</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 50</td>\n", | |
" <td> ug/L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <th>lead</th>\n", | |
" <td> 20</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 60</td>\n", | |
" <td> ug/L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <th>lead</th>\n", | |
" <td> 30</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 70</td>\n", | |
" <td> ug/L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <th>lead</th>\n", | |
" <td> 40</td>\n", | |
" <td> ug/L</td>\n", | |
" <td> 80</td>\n", | |
" <td> ug/L</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"Monitoring Station inflow inflow outflow outflow\n", | |
"Quantity conc units conc units\n", | |
"storm param \n", | |
"1 lead 10 ug/L 50 ug/L\n", | |
"2 lead 20 ug/L 60 ug/L\n", | |
"3 lead 30 ug/L 70 ug/L\n", | |
"4 lead 40 ug/L 80 ug/L" | |
] | |
} | |
], | |
"prompt_number": 13 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment