TDahlberg · June 17, 2016 15:23
diff --git a/chf_datacleaning.ipynb b/chf_datacleaning.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "import pandas\n",
    "import re\n",
    "## Change the path to reflect the EPA data file location to be read in\n",
    "filepath = '/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all.csv'\n",
    "data = pandas.read_csv(filepath, low_memory=False)\n",
    "print(list(data))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "# This removes columns and keeps only those in the list below\n",
    "columnstokeep = ['Latitude','Longitude','Parameter Name','Sample Duration','Pollutant Standard','Year','Units of Measure',\n",
    "                'Primary Exceedance Count','Arithmetic Mean','1st Max DateTime','Address','State Name','County Name',\n",
    "                'City Name']\n",
    "less_columns = pandas.DataFrame(data,columns = columnstokeep)\n",
    "print(list(less_columns))\n",
    "print(len(less_columns.index))\n",
    "print(less_columns['Parameter Name'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "## This section keeps only those records with criteria pollutants that match those below using regular expressions\n",
    "parameter = ['[Ss]ulfur [Dd]ioxide','[Nn]itrogen [Dd]ioxide','[Ll]ead','[Pp][Mm]','[Oo]zone',\n",
    "             '[Cc]arbon [Mm]onoxide']\n",
    "\n",
    "filter_by_parameter = less_columns[less_columns['Parameter Name'].str.contains('|'.join(parameter),flags=re.IGNORECASE, regex=True)]\n",
    "print(len(filter_by_parameter.index))\n",
    "print(filter_by_parameter['Parameter Name'].unique())\n",
    "print(filter_by_parameter['Sample Duration'].unique())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Filter to only the times that match below\n",
    "time = ['24 HOUR','24-HR BLK AVG','1 HOUR']\n",
    "\n",
    "filter_by_time = filter_by_parameter.loc[filter_by_parameter['Sample Duration'].isin(time)]\n",
    "print(len(filter_by_time.index))\n",
    "print(filter_by_time['Parameter Name'].unique())\n",
    "print(filter_by_time['Sample Duration'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write out the resulting CSV\n",
    "## Change the path to the filename you want to write out\n",
    "outpath = \"/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all_clean.csv\"\n",
    "filter_by_time.to_csv(outpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"slideshow": {
	"slide_type": "slide"
	}
	},
	"outputs": [],
	"source": [
	"import pandas\n",
	"import re\n",
	"## Change the path to reflect the EPA data file location to be read in\n",
	"filepath = '/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all.csv'\n",
	"data = pandas.read_csv(filepath, low_memory=False)\n",
	"print(list(data))\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"slideshow": {
	"slide_type": "slide"
	}
	},
	"outputs": [],
	"source": [
	"# This removes columns and keeps only those in the list below\n",
	"columnstokeep = ['Latitude','Longitude','Parameter Name','Sample Duration','Pollutant Standard','Year','Units of Measure',\n",
	" 'Primary Exceedance Count','Arithmetic Mean','1st Max DateTime','Address','State Name','County Name',\n",
	" 'City Name']\n",
	"less_columns = pandas.DataFrame(data,columns = columnstokeep)\n",
	"print(list(less_columns))\n",
	"print(len(less_columns.index))\n",
	"print(less_columns['Parameter Name'].unique())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"## This section keeps only those records with criteria pollutants that match those below using regular expressions\n",
	"parameter = ['[Ss]ulfur [Dd]ioxide','[Nn]itrogen [Dd]ioxide','[Ll]ead','[Pp][Mm]','[Oo]zone',\n",
	" '[Cc]arbon [Mm]onoxide']\n",
	"\n",
	"filter_by_parameter = less_columns[less_columns['Parameter Name'].str.contains('\|'.join(parameter),flags=re.IGNORECASE, regex=True)]\n",
	"print(len(filter_by_parameter.index))\n",
	"print(filter_by_parameter['Parameter Name'].unique())\n",
	"print(filter_by_parameter['Sample Duration'].unique())\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"# Filter to only the times that match below\n",
	"time = ['24 HOUR','24-HR BLK AVG','1 HOUR']\n",
	"\n",
	"filter_by_time = filter_by_parameter.loc[filter_by_parameter['Sample Duration'].isin(time)]\n",
	"print(len(filter_by_time.index))\n",
	"print(filter_by_time['Parameter Name'].unique())\n",
	"print(filter_by_time['Sample Duration'].unique())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Write out the resulting CSV\n",
	"## Change the path to the filename you want to write out\n",
	"outpath = \"/Users/tylerdahlberg/projects/internal/summerofmaps/chf/datacleaning/EPA_All/epa_all_clean.csv\"\n",
	"filter_by_time.to_csv(outpath)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}