Created
August 16, 2013 00:31
-
-
Save phobson/6246234 to your computer and use it in GitHub Desktop.
huge data stack with pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Setup the environment and load data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas\n", | |
"from IPython.display import HTML\n", | |
"\n", | |
"# read data in as csv with three rows of column headers\n", | |
"df_cols = pandas.read_csv('nsqd/NSQD_xtab.csv', na_values=[-999], header=[0,1,2], index_col=range(38))\n", | |
"\n", | |
"# seperate out each row of column headers\n", | |
"df_cols.columns = pandas.MultiIndex.from_tuples(df_cols.columns)\n", | |
"\n", | |
"# name each column header\n", | |
"df_cols.columns.names = ['parameter', 'units', 'value']\n", | |
"print('Row Index: ')\n", | |
"print(df_cols.index.names)\n", | |
"print('\\n Column Levels:')\n", | |
"print(df_cols.columns.names)\n", | |
"print('\\n Columns:')\n", | |
"print(df_cols.columns.tolist())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Row Index: \n", | |
"['ORDER', 'Primary Landuse', 'Secondary Landuse', 'Season', 'Database', 'LOCATION_ID', 'Jurisdiction (County)', 'Jurisdiction (City)', 'Site_ID', 'Rain Zone', 'Contact', 'PLU_Residential', 'PLU_Institutional', 'PLU_Commercial', 'PLU_Industrial', 'PLU_Open_Space', 'PLU_Freeway', 'PLU_Water', 'PLU_UNK', 'Drainage_Area', 'Status', 'Latitude', 'Longitude', 'Per_Impervious', 'Q1', 'Runoff_Vol_Coef', 'Age_of_Development', 'Type_Conveyance', 'Controls', 'Comments', 'EVENT_ID_Conventional', 'Start Date', 'End Date', 'Maxr15', 'Event Type', 'Sample Timing', 'Sample Type', 'SaAnType']\n", | |
"\n", | |
" Column Levels:\n", | |
"['parameter', 'units', 'value']\n", | |
"\n", | |
" Columns:\n", | |
"[('Precipitation Depth', 'in', 'res'), ('Precipitation Depth', 'in', 'qual'), ('Runoff', 'in', 'res'), ('Runoff', 'in', 'qual'), ('Days since last rain', '--', 'res'), ('Days since last rain', '--', 'qual'), ('Conductivity', 'uS/cm @25 degC', 'res'), ('Conductivity', 'uS/cm @25 degC', 'qual'), ('DO', 'mg/L', 'res'), ('DO', 'mg/L', 'qual'), ('Hardness as CaCO3', 'mg/L', 'res'), ('Hardness as CaCO3', 'mg/L', 'qual'), ('Total Oil and Grease', 'mg/L', 'res'), ('Total Oil and Grease', 'mg/L', 'qual'), ('pH', '--', 'res'), ('pH', '--', 'qual'), ('Turbidity', 'NTU', 'res'), ('Turbidity', 'NTU', 'qual'), ('Temperature', 'degC', 'res'), ('Temperature', 'degC', 'qual'), ('Total Dissolved Solids', 'mg/L', 'res'), ('Total Dissolved Solids', 'mg/L', 'qual'), ('Total Solids', 'mg/L', 'res'), ('Total Solids', 'mg/L', 'qual'), ('Total Suspended Solids', 'mg/L', 'res'), ('Total Suspended Solids', 'mg/L', 'qual'), ('BOD5', 'mg/L', 'res'), ('BOD5', 'mg/L', 'qual'), ('COD', 'mg/L', 'res'), ('COD', 'mg/L', 'qual'), ('Fecal Coliform', 'CFU/100mL', 'res'), ('Fecal Coliform', 'CFU/100mL', 'qual'), ('Fecal Streptococcus', 'CFU/100mL', 'res'), ('Fecal Streptococcus', 'CFU/100mL', 'qual'), ('Total Coliform', 'CFU/100mL', 'res'), ('Total Coliform', 'CFU/100mL', 'qual'), ('E. Coli', 'CFU/100mL', 'res'), ('E. Coli', 'CFU/100mL', 'qual'), ('Ammonia', 'mg/L', 'res'), ('Ammonia', 'mg/L', 'qual'), ('Nitrate + Nitrite', 'mg/L', 'res'), ('Nitrate + Nitrite', 'mg/L', 'qual'), ('Total Nitrogen', 'mg/L', 'res'), ('Total Nitrogen', 'mg/L', 'qual'), ('Total Kjeldahl Nitrogen', 'mg/L', 'res'), ('Total Kjeldahl Nitrogen', 'mg/L', 'qual'), ('Orthophosphate', 'mg/L', 'res'), ('Orthophosphate', 'mg/L', 'qual'), ('Dissolved Phosphorus', 'mg/L', 'res'), ('Dissolved Phosphorus', 'mg/L', 'qual'), ('Total Phosphorus', 'mg/L', 'res'), ('Total Phosphorus', 'mg/L', 'qual'), ('Total Antimony', 'ug/L', 'res'), ('Total Antimony', 'ug/L', 'qual'), ('Dissolved Antimony', 'ug/L', 'res'), ('Dissolved Antimony', 'ug/L', 'qual'), ('Total Arsenic', 'ug/L', 'res'), ('Total Arsenic', 'ug/L', 'qual'), ('Dissolved Arsenic', 'ug/L', 'res'), ('Dissolved Arsenic', 'ug/L', 'qual'), ('Total Beryllium', 'ug/L', 'res'), ('Total Beryllium', 'ug/L', 'qual'), ('Dissolved Beryllium', 'ug/L', 'res'), ('Dissolved Beryllium', 'ug/L', 'qual'), ('Total Cadmium', 'ug/L', 'res'), ('Total Cadmium', 'ug/L', 'qual'), ('Dissolved Cadmium', 'ug/L', 'res'), ('Dissolved Cadmium', 'ug/L', 'qual'), ('Total Chromium', 'ug/L', 'res'), ('Total Chromium', 'ug/L', 'qual'), ('Dissolved Chromium', 'ug/L', 'res'), ('Dissolved Chromium', 'ug/L', 'qual'), ('Total Copper', 'ug/L', 'res'), ('Total Copper', 'ug/L', 'qual'), ('Dissolved Copper', 'ug/L', 'res'), ('Dissolved Copper', 'ug/L', 'qual'), ('Total Cyanide', 'ug/L', 'res'), ('Total Cyanide', 'ug/L', 'qual'), ('Dissolved Cyanide', 'ug/L', 'res'), ('Dissolved Cyanide', 'ug/L', 'qual'), ('Total Lead', 'ug/L', 'res'), ('Total Lead', 'ug/L', 'qual'), ('Dissolved Lead', 'ug/L', 'res'), ('Dissolved Lead', 'ug/L', 'qual'), ('Total Mercury', 'ug/L', 'res'), ('Total Mercury', 'ug/L', 'qual'), ('Dissolved Mercury', 'ug/L', 'res'), ('Dissolved Mercury', 'ug/L', 'qual'), ('Total Nickel', 'ug/L', 'res'), ('Total Nickel', 'ug/L', 'qual'), ('Dissolved Nickel', 'ug/L', 'res'), ('Dissolved Nickel', 'ug/L', 'qual'), ('Total Selenium', 'ug/L', 'res'), ('Total Selenium', 'ug/L', 'qual'), ('Dissolved Selenium', 'ug/L', 'res'), ('Dissolved Selenium', 'ug/L', 'qual'), ('Total Silver', 'ug/L', 'res'), ('Total Silver', 'ug/L', 'qual'), ('Dissolved Silver', 'ug/L', 'res'), ('Dissolved Silver', 'ug/L', 'qual'), ('Total Thallium', 'ug/L', 'res'), ('Total Thallium', 'ug/L', 'qual'), ('Dissolved Thallium', 'ug/L', 'res'), ('Dissolved Thallium', 'ug/L', 'qual'), ('Total Zinc', 'ug/L', 'res'), ('Total Zinc', 'ug/L', 'qual'), ('Dissolved Zinc', 'ug/L', 'res'), ('Dissolved Zinc', 'ug/L', 'qual'), ('Acrolein', 'ug/L', 'res'), ('Acrolein', 'ug/L', 'qual'), ('Acrylonitrile', 'ug/L', 'res'), ('Acrylonitrile', 'ug/L', 'qual'), ('Benzene', 'ug/L', 'res'), ('Benzene', 'ug/L', 'qual'), ('Bromoform', 'ug/L', 'res'), ('Bromoform', 'ug/L', 'qual'), ('Chlorobenzene', 'ug/L', 'res'), ('Chlorobenzene', 'ug/L', 'qual'), ('Chlorodibromomethane', 'ug/L', 'res'), ('Chlorodibromomethane', 'ug/L', 'qual'), ('Chloroethane', 'ug/L', 'res'), ('Chloroethane', 'ug/L', 'qual'), ('2-Chloroethylvinylether', 'ug/L', 'res'), ('2-Chloroethylvinylether', 'ug/L', 'qual'), ('Chloroform', 'ug/L', 'res'), ('Chloroform', 'ug/L', 'qual'), ('Dichlorobromoethane', 'ug/L', 'res'), ('Dichlorobromoethane', 'ug/L', 'qual'), ('1,1-Dichloroethane', 'ug/L', 'res'), ('1,1-Dichloroethane', 'ug/L', 'qual'), ('1,2-Dichloroethane', 'ug/L', 'res'), ('1,2-Dichloroethane', 'ug/L', 'qual'), ('1,1-Dichloroethylene', 'ug/L', 'res'), ('1,1-Dichloroethylene', 'ug/L', 'qual'), ('1,2-Dichloropropane', 'ug/L', 'res'), ('1,2-Dichloropropane', 'ug/L', 'qual'), ('1,3-Dichloropropylene', 'ug/L', 'res'), ('1,3-Dichloropropylene', 'ug/L', 'qual'), ('Ethylbenzene', 'ug/L', 'res'), ('Ethylbenzene', 'ug/L', 'qual'), ('Methylbromide', 'ug/L', 'res'), ('Methylbromide', 'ug/L', 'qual'), ('Methylchloride', 'ug/L', 'res'), ('Methylchloride', 'ug/L', 'qual'), ('Methylenechloride', 'ug/L', 'res'), ('Methylenechloride', 'ug/L', 'qual'), ('1,1,2,2-Tetrachloroethane', 'ug/L', 'res'), ('1,1,2,2-Tetrachloroethane', 'ug/L', 'qual'), ('Tetrachloroethylene', 'ug/L', 'res'), ('Tetrachloroethylene', 'ug/L', 'qual'), ('Toluene', 'ug/L', 'res'), ('Toluene', 'ug/L', 'qual'), ('1,2-Trans-Dichloroetylene', 'ug/L', 'res'), ('1,2-Trans-Dichloroetylene', 'ug/L', 'qual'), ('1,1,1-Trichloroethane', 'ug/L', 'res'), ('1,1,1-Trichloroethane', 'ug/L', 'qual'), ('1,1,2-Trichloroethane', 'ug/L', 'res'), ('1,1,2-Trichloroethane', 'ug/L', 'qual'), ('Trichloroethylene', 'ug/L', 'res'), ('Trichloroethylene', 'ug/L', 'qual'), ('Vinylchloride', 'ug/L', 'res'), ('Vinylchloride', 'ug/L', 'qual'), ('Total Alkalinity', 'mg/L', 'res'), ('Total Alkalinity', 'mg/L', 'qual'), ('Coliform/Fecal Fecal', '--', 'res'), ('Coliform/Fecal Fecal', '--', 'qual'), ('Oil and Grease Hydrocarbons', 'mg/L', 'res'), ('Oil and Grease Hydrocarbons', 'mg/L', 'qual'), ('Total Hydrocarbon Fingerprint', 'mg/L', 'res'), ('Total Hydrocarbon Fingerprint', 'mg/L', 'qual'), ('Total Petroleum Hydrocarbon', 'mg/L', 'res'), ('Total Petroleum Hydrocarbon', 'mg/L', 'qual'), ('Total Organic Carbon', 'mg/L', 'res'), ('Total Organic Carbon', 'mg/L', 'qual'), ('Dissolved Chloride', 'mg/L', 'res'), ('Dissolved Chloride', 'mg/L', 'qual'), ('Trans-1,3-Dichloropropene', 'ug/L', 'res'), ('Trans-1,3-Dichloropropene', 'ug/L', 'qual'), ('Bromomethane', 'ug/L', 'res'), ('Bromomethane', 'ug/L', 'qual'), ('Chloromethane', 'ug/L', 'res'), ('Chloromethane', 'ug/L', 'qual'), ('Trichlorofluoromethane', 'ug/L', 'res'), ('Trichlorofluoromethane', 'ug/L', 'qual'), ('Nitrate as N', 'mg/L', 'res'), ('Nitrate as N', 'mg/L', 'qual'), ('Nitrite as N', 'mg/L', 'res'), ('Nitrite as N', 'mg/L', 'qual'), ('Total Organic Nitrogen', 'mg/L', 'res'), ('Total Organic Nitrogen', 'mg/L', 'qual'), ('Total Barium', 'ug/L', 'res'), ('Total Barium', 'ug/L', 'qual'), ('Total Iron', 'ug/L', 'res'), ('Total Iron', 'ug/L', 'qual'), ('Dissolved Iron', 'ug/L', 'res'), ('Dissolved Iron', 'ug/L', 'qual')]\n" | |
] | |
} | |
], | |
"prompt_number": 98 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Convert data from x-tab to DB format and export" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# stack the data into a row-based format on the \n", | |
"# 'parameter' & 'units' levels, dropping all rows\n", | |
"# where the res *and* qual are NA\n", | |
"df_rows = df_cols.stack(level=0, dropna=True) \\\n", | |
" .stack(level=0, dropna=False) \\\n", | |
" .dropna(how='all')\n", | |
"\n", | |
"# dump to csv\n", | |
"df_rows.to_csv('nsqd/nsqd_20130815.csv', na_rep='--')\n", | |
"print('Row Index: ')\n", | |
"print(df_rows.index.names)\n", | |
"print('\\n Columns:')\n", | |
"print(df_rows.columns.tolist())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Row Index: \n", | |
"['ORDER', 'Primary Landuse', 'Secondary Landuse', 'Season', 'Database', 'LOCATION_ID', 'Jurisdiction (County)', 'Jurisdiction (City)', 'Site_ID', 'Rain Zone', 'Contact', 'PLU_Residential', 'PLU_Institutional', 'PLU_Commercial', 'PLU_Industrial', 'PLU_Open_Space', 'PLU_Freeway', 'PLU_Water', 'PLU_UNK', 'Drainage_Area', 'Status', 'Latitude', 'Longitude', 'Per_Impervious', 'Q1', 'Runoff_Vol_Coef', 'Age_of_Development', 'Type_Conveyance', 'Controls', 'Comments', 'EVENT_ID_Conventional', 'Start Date', 'End Date', 'Maxr15', 'Event Type', 'Sample Timing', 'Sample Type', 'SaAnType', 'parameter', 'units']\n", | |
"\n", | |
" Columns:\n", | |
"['qual', 'res']\n" | |
] | |
} | |
], | |
"prompt_number": 97 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Count the results in Raleigh, NC" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"raleigh = df.select(lambda x: 'Raleigh' in x[7])\n", | |
"raleigh_params = raleigh.groupby(level='parameter')\n", | |
"raleigh_params.size()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 82, | |
"text": [ | |
"parameter\n", | |
"BOD5 61\n", | |
"COD 61\n", | |
"Days since last rain 63\n", | |
"Dissolved Phosphorus 60\n", | |
"Nitrate + Nitrite 50\n", | |
"Precipitation Depth 63\n", | |
"Total Cadmium 63\n", | |
"Total Copper 63\n", | |
"Total Dissolved Solids 62\n", | |
"Total Kjeldahl Nitrogen 62\n", | |
"Total Lead 63\n", | |
"Total Nitrogen 61\n", | |
"Total Phosphorus 61\n", | |
"Total Suspended Solids 62\n", | |
"Total Zinc 63\n", | |
"pH 13\n", | |
"dtype: int64" | |
] | |
} | |
], | |
"prompt_number": 82 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment