Skip to content

Instantly share code, notes, and snippets.

@TDahlberg
Created June 17, 2016 15:23
Show Gist options
  • Save TDahlberg/98ef94dd461a91c1160325e962935365 to your computer and use it in GitHub Desktop.
Save TDahlberg/98ef94dd461a91c1160325e962935365 to your computer and use it in GitHub Desktop.
CHF Data Cleaning notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import pandas\n",
"import re\n",
"## Change the path to reflect the EPA data file location to be read in\n",
"filepath = '/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all.csv'\n",
"data = pandas.read_csv(filepath, low_memory=False)\n",
"print(list(data))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"# This removes columns and keeps only those in the list below\n",
"columnstokeep = ['Latitude','Longitude','Parameter Name','Sample Duration','Pollutant Standard','Year','Units of Measure',\n",
" 'Primary Exceedance Count','Arithmetic Mean','1st Max DateTime','Address','State Name','County Name',\n",
" 'City Name']\n",
"less_columns = pandas.DataFrame(data,columns = columnstokeep)\n",
"print(list(less_columns))\n",
"print(len(less_columns.index))\n",
"print(less_columns['Parameter Name'].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"## This section keeps only those records with criteria pollutants that match those below using regular expressions\n",
"parameter = ['[Ss]ulfur [Dd]ioxide','[Nn]itrogen [Dd]ioxide','[Ll]ead','[Pp][Mm]','[Oo]zone',\n",
" '[Cc]arbon [Mm]onoxide']\n",
"\n",
"filter_by_parameter = less_columns[less_columns['Parameter Name'].str.contains('|'.join(parameter),flags=re.IGNORECASE, regex=True)]\n",
"print(len(filter_by_parameter.index))\n",
"print(filter_by_parameter['Parameter Name'].unique())\n",
"print(filter_by_parameter['Sample Duration'].unique())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Filter to only the times that match below\n",
"time = ['24 HOUR','24-HR BLK AVG','1 HOUR']\n",
"\n",
"filter_by_time = filter_by_parameter.loc[filter_by_parameter['Sample Duration'].isin(time)]\n",
"print(len(filter_by_time.index))\n",
"print(filter_by_time['Parameter Name'].unique())\n",
"print(filter_by_time['Sample Duration'].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write out the resulting CSV\n",
"## Change the path to the filename you want to write out\n",
"outpath = \"/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all_clean.csv\"\n",
"filter_by_time.to_csv(outpath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment