phobson · August 23, 2014 01:58
diff --git a/tidy_scratch.py b/tidy_scratch.py
 %matplotlib inline
 import numpy as np
 import pandas
 import matplotlib.pyplot as plt

 import seaborn
 seaborn.set(style='ticks', palette='deep', rc={'text.usetex': False})

 import bmp
 import utils

 # parameters to analyze
 pollutants = [
     'Cadmium, Total',
     'Copper, Total',
     'Lead, Total',
     'Zinc, Total',
 ]

 # BMP categories to compare
 BMPs = [
     'Bioretention',
     'Detention Basin',
     'Retention Pond',
     'Wetland Basin',
 ]

 # read the full dataset together with the BMP category source file
 database = bmp.Database(
     "bmp/data/everything.csv",
     bmpcat_src="bmp/data/bmpcats.csv"
 )

 # keep only the parameters and BMP categories listed above
 # (the data come x-tabbed with influent/effluent in the columns; the
 # `stack` call inside `DatasetCollection` moves them back into rows)
 metals = database.selectData(category=BMPs, parameter=pollutants)

 def getROS(group):
     """Run `utils.ros.MR` on one group and return its `data` attribute.

     Parameters
     ----------
     group : object
         Passed unchanged to `utils.ros.MR`. `DatasetCollection` hands
         this function to `groupby(...).apply`, so it receives one
         group at a time.

     Returns
     -------
     object
         The `data` attribute of the `MR` instance built from `group`.
     """
     # NOTE(review): "ROS" presumably means regression on order
     # statistics -- confirm against utils.ros.
     return utils.ros.MR(group).data

 class DatasetCollection(object):
     """Build a tidy (long-format) table from a cross-tabbed results frame.

     On construction the input frame is stacked on its station level, the
     index is reset, and only the grouping, result, and qualifier columns
     are kept. Each group is then passed through `getROS`, and the combined
     output is stored in `tidy`.

     Parameters
     ----------
     dataframe : pandas.DataFrame
         Source data. `stationcol` must name a level that `stack` can move
         into the rows, and the other configured columns must be present
         once the index has been reset.
     rescol : str, optional
         Name of the raw result column.
     qualcol : str, optional
         Name of the qualifier column.
     stationcol : str, optional
         Name of the level to stack; also the first grouping column.
     paramcol : str, optional
         Name of the parameter column; the second grouping column.
     othergroups : label or list/tuple of labels, optional
         Extra column(s) to group by after station and parameter. `None`
         (the default) adds no extra grouping columns.
     useROS : bool, optional
         Only selects which column name the `rescol` attribute points to:
         ``'ros_' + rescol`` when true, the raw `rescol` otherwise. The
         `getROS` step itself always runs.

     Attributes
     ----------
     data : pandas.DataFrame
         The frame passed in, unchanged.
     rescol, qualcol, stationcol, paramcol : str
         Column names as described above.
     groupby : list
         Flat list of grouping column labels.
     columns : list
         `groupby` plus the raw result and qualifier columns.
     tidy : pandas.DataFrame
         The stacked, grouped, and processed table.
     """

     def __init__(self, dataframe, rescol='res', qualcol='qual',
                  stationcol='station', paramcol='parameter',
                  othergroups=None, useROS=True):
         self.data = dataframe
         self._raw_rescol = rescol
         self.qualcol = qualcol
         self.stationcol = stationcol
         self.paramcol = paramcol

         roscol = 'ros_' + rescol
         self.rescol = roscol if useROS else rescol

         # Normalize `othergroups` to a flat list of labels. Wrapping the
         # value unconditionally would nest an already-list argument (and
         # the empty default) one level too deep, putting a list inside
         # the grouping columns.
         if othergroups is None:
             extras = []
         elif isinstance(othergroups, (list, tuple)):
             extras = list(othergroups)
         else:
             extras = [othergroups]

         self.groupby = [stationcol, paramcol] + extras
         self.columns = self.groupby + [self._raw_rescol, self.qualcol]

         # stack -> select columns -> group -> getROS per group; any
         # column named 'final_data' in the result is renamed to `roscol`
         self.tidy = (
             self.data
                 .stack(level=self.stationcol)
                 .reset_index()[self.columns]
                 .groupby(by=self.groupby)
                 .apply(getROS)
                 .reset_index()
                 .rename(columns={'final_data': roscol})
         )

         # Every column that is neither a requested column nor the ROS
         # column is used once as a sort key and then dropped, together
         # with the 'index' column that `reset_index` adds.
         keep = self.columns + [roscol]
         for c in self.tidy.columns:
             if c not in keep:
                 # `sort_values` replaces `DataFrame.sort(columns=...)`,
                 # which no longer exists in current pandas.
                 self.tidy = (
                     self.tidy
                         .sort_values(by=c)
                         .reset_index()
                         .drop([c, 'index'], axis=1)
                 )
                                    
 # build the collection, grouping by BMP category as well
 dc = DatasetCollection(metals, othergroups='category')

 # show the first ten tidy rows for total zinc at bioretention BMPs
 dc.tidy[
     (dc.tidy['parameter'] == 'Zinc, Total') &
     (dc.tidy['category'] == 'Bioretention')
 ].head(10)
	%matplotlib inline
	import numpy as np
	import pandas
	import matplotlib.pyplot as plt

	import seaborn
	seaborn.set(style='ticks', palette='deep', rc={'text.usetex': False})

	import bmp
	import utils

	# parameters to analyze
	pollutants = [
	    'Cadmium, Total',
	    'Copper, Total',
	    'Lead, Total',
	    'Zinc, Total',
	]

	# BMP categories to compare
	BMPs = [
	    'Bioretention',
	    'Detention Basin',
	    'Retention Pond',
	    'Wetland Basin',
	]

	# read the full dataset together with the BMP category source file
	database = bmp.Database(
	    "bmp/data/everything.csv",
	    bmpcat_src="bmp/data/bmpcats.csv"
	)

	# keep only the parameters and BMP categories listed above
	# (the data come x-tabbed with influent/effluent in the columns; the
	# `stack` call inside `DatasetCollection` moves them back into rows)
	metals = database.selectData(category=BMPs, parameter=pollutants)

	def getROS(group):
	    """Run `utils.ros.MR` on one group and return its `data` attribute.

	    Parameters
	    ----------
	    group : object
	        Passed unchanged to `utils.ros.MR`. `DatasetCollection` hands
	        this function to `groupby(...).apply`, so it receives one
	        group at a time.

	    Returns
	    -------
	    object
	        The `data` attribute of the `MR` instance built from `group`.
	    """
	    # NOTE(review): "ROS" presumably means regression on order
	    # statistics -- confirm against utils.ros.
	    return utils.ros.MR(group).data

	class DatasetCollection(object):
	    """Build a tidy (long-format) table from a cross-tabbed results frame.

	    On construction the input frame is stacked on its station level, the
	    index is reset, and only the grouping, result, and qualifier columns
	    are kept. Each group is then passed through `getROS`, and the combined
	    output is stored in `tidy`.

	    Parameters
	    ----------
	    dataframe : pandas.DataFrame
	        Source data. `stationcol` must name a level that `stack` can move
	        into the rows, and the other configured columns must be present
	        once the index has been reset.
	    rescol : str, optional
	        Name of the raw result column.
	    qualcol : str, optional
	        Name of the qualifier column.
	    stationcol : str, optional
	        Name of the level to stack; also the first grouping column.
	    paramcol : str, optional
	        Name of the parameter column; the second grouping column.
	    othergroups : label or list/tuple of labels, optional
	        Extra column(s) to group by after station and parameter. `None`
	        (the default) adds no extra grouping columns.
	    useROS : bool, optional
	        Only selects which column name the `rescol` attribute points to:
	        ``'ros_' + rescol`` when true, the raw `rescol` otherwise. The
	        `getROS` step itself always runs.

	    Attributes
	    ----------
	    data : pandas.DataFrame
	        The frame passed in, unchanged.
	    rescol, qualcol, stationcol, paramcol : str
	        Column names as described above.
	    groupby : list
	        Flat list of grouping column labels.
	    columns : list
	        `groupby` plus the raw result and qualifier columns.
	    tidy : pandas.DataFrame
	        The stacked, grouped, and processed table.
	    """

	    def __init__(self, dataframe, rescol='res', qualcol='qual',
	                 stationcol='station', paramcol='parameter',
	                 othergroups=None, useROS=True):
	        self.data = dataframe
	        self._raw_rescol = rescol
	        self.qualcol = qualcol
	        self.stationcol = stationcol
	        self.paramcol = paramcol

	        roscol = 'ros_' + rescol
	        self.rescol = roscol if useROS else rescol

	        # Normalize `othergroups` to a flat list of labels. Wrapping the
	        # value unconditionally would nest an already-list argument (and
	        # the empty default) one level too deep, putting a list inside
	        # the grouping columns.
	        if othergroups is None:
	            extras = []
	        elif isinstance(othergroups, (list, tuple)):
	            extras = list(othergroups)
	        else:
	            extras = [othergroups]

	        self.groupby = [stationcol, paramcol] + extras
	        self.columns = self.groupby + [self._raw_rescol, self.qualcol]

	        # stack -> select columns -> group -> getROS per group; any
	        # column named 'final_data' in the result is renamed to `roscol`
	        self.tidy = (
	            self.data
	                .stack(level=self.stationcol)
	                .reset_index()[self.columns]
	                .groupby(by=self.groupby)
	                .apply(getROS)
	                .reset_index()
	                .rename(columns={'final_data': roscol})
	        )

	        # Every column that is neither a requested column nor the ROS
	        # column is used once as a sort key and then dropped, together
	        # with the 'index' column that `reset_index` adds.
	        keep = self.columns + [roscol]
	        for c in self.tidy.columns:
	            if c not in keep:
	                # `sort_values` replaces `DataFrame.sort(columns=...)`,
	                # which no longer exists in current pandas.
	                self.tidy = (
	                    self.tidy
	                        .sort_values(by=c)
	                        .reset_index()
	                        .drop([c, 'index'], axis=1)
	                )

	# build the collection, grouping by BMP category as well
	dc = DatasetCollection(metals, othergroups='category')

	# show the first ten tidy rows for total zinc at bioretention BMPs
	dc.tidy[
	    (dc.tidy['parameter'] == 'Zinc, Total') &
	    (dc.tidy['category'] == 'Bioretention')
	].head(10)