Created
June 17, 2016 15:23
-
-
Save TDahlberg/98ef94dd461a91c1160325e962935365 to your computer and use it in GitHub Desktop.
CHF Data Cleaning notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas\n", | |
"import re\n", | |
"## Change the path to reflect the EPA data file location to be read in\n", | |
"filepath = '/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all.csv'\n", | |
"data = pandas.read_csv(filepath, low_memory=False)\n", | |
"print(list(data))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# This removes columns and keeps only those in the list below\n", | |
"columnstokeep = ['Latitude','Longitude','Parameter Name','Sample Duration','Pollutant Standard','Year','Units of Measure',\n", | |
" 'Primary Exceedance Count','Arithmetic Mean','1st Max DateTime','Address','State Name','County Name',\n", | |
" 'City Name']\n", | |
"less_columns = pandas.DataFrame(data,columns = columnstokeep)\n", | |
"print(list(less_columns))\n", | |
"print(len(less_columns.index))\n", | |
"print(less_columns['Parameter Name'].unique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"## This section keeps only those records with criteria pollutants that match those below using regular expressions\n", | |
"parameter = ['[Ss]ulfur [Dd]ioxide','[Nn]itrogen [Dd]ioxide','[Ll]ead','[Pp][Mm]','[Oo]zone',\n", | |
" '[Cc]arbon [Mm]onoxide']\n", | |
"\n", | |
"filter_by_parameter = less_columns[less_columns['Parameter Name'].str.contains('|'.join(parameter),flags=re.IGNORECASE, regex=True)]\n", | |
"print(len(filter_by_parameter.index))\n", | |
"print(filter_by_parameter['Parameter Name'].unique())\n", | |
"print(filter_by_parameter['Sample Duration'].unique())\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Filter to only the times that match below\n", | |
"time = ['24 HOUR','24-HR BLK AVG','1 HOUR']\n", | |
"\n", | |
"filter_by_time = filter_by_parameter.loc[filter_by_parameter['Sample Duration'].isin(time)]\n", | |
"print(len(filter_by_time.index))\n", | |
"print(filter_by_time['Parameter Name'].unique())\n", | |
"print(filter_by_time['Sample Duration'].unique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Write out the resulting CSV\n", | |
"## Change the path to the filename you want to write out\n", | |
"outpath = \"/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all_clean.csv\"\n", | |
"filter_by_time.to_csv(outpath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment