Last active
February 15, 2016 13:05
-
-
Save d1manson/7d7b60fe508ba3e748a4 to your computer and use it in GitHub Desktop.
pandas Row_Indexer - syntactic sugar (with large overhead) for assigning values to columns within a single row of a DataFrame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class RowKeyError(KeyError, AttributeError):
    """Raised when a name is neither a column nor an index level of the row.

    It derives from ``KeyError`` so existing ``except KeyError`` handlers keep
    working, and from ``AttributeError`` so that ``hasattr`` and
    ``getattr(row, name, default)`` behave normally.
    """


class RowIndexer(object):
    """Attribute-style access to the cells of a single DataFrame row.

    This class is syntactic sugar (with a large overhead) for a slow
    iteration over the rows of a pandas DataFrame, where each row needs some
    (slow) computation performed and the results assigned to several columns
    of that row. A bare-bones example::

        df = ...                    # create a DataFrame
        the_row = RowIndexer(df, 3) # explicitly choose the row labelled 3
        the_row.something = "hello"
        the_row.other_thing = 99.2
        # df.at[3, 'something']   -> "hello"
        # df.at[3, 'other_thing'] -> 99.2

    The intended use case is within a generator which yields a new
    RowIndexer each time.

    Reading an attribute (or item, ``the_row['name']``) tries the column of
    that name first and then the index level of that name, e.g. if ``'day'``
    is one of the index names then ``the_row.day`` is this row's value for
    that level. Assigning an attribute writes into the DataFrame, creating
    the column if necessary.
    """

    def __init__(self, obj, idx, **kwargs):
        """Bind the indexer to row ``idx`` of DataFrame ``obj``.

        Parameters
        ----------
        obj : the parent DataFrame; it is modified in place by assignments.
        idx : the row label (a tuple for a multi-level index).
        **kwargs : extra values stored directly on the indexer as plain
            instance attributes; they take precedence over columns when read.
        """
        # object.__setattr__ is used throughout for internal state because
        # our own __setattr__ redirects every assignment into the DataFrame.
        object.__setattr__(self, '_obj', obj)
        object.__setattr__(self, '_idx', idx)
        object.__setattr__(self, '_with_column_sub', None)
        for name, value in kwargs.items():
            object.__setattr__(self, name, value)

    def _sub_key(self, key):
        """Return ``key`` placed in the '[attr]' slot of the active column sub."""
        template = self._with_column_sub
        slot = template.index('[attr]')
        return template[:slot] + (key,) + template[slot + 1:]

    def __getattr__(self, key):
        """Look ``key`` up as a column of the row, then as an index level.

        Only called when normal attribute lookup fails.

        Raises
        ------
        RowKeyError
            (a ``KeyError`` and an ``AttributeError``) if nothing matches.
        """
        # If the internal state is missing (e.g. the instance was created
        # without __init__), looking it up below would re-enter this method
        # forever; fail fast instead.
        if isinstance(key, str) and key in ('_obj', '_idx', '_with_column_sub'):
            raise AttributeError(key)

        # Candidate column labels: the key as given, the key expanded by the
        # active column sub, and the key padded to the full column depth.
        candidates = [key]
        if self._with_column_sub is not None:
            candidates.append(self._sub_key(key))
        nlevels = self._obj.columns.nlevels
        if nlevels > 1 and not isinstance(key, tuple):
            candidates.append((key,) + ('',) * (nlevels - 1))

        for candidate in candidates:
            try:
                return self._obj.at[self._idx, candidate]
            except Exception:
                # pandas raises several different exception types for a
                # label it cannot resolve; any of them means "try the next".
                continue

        # Fall back to the index level names, using the *original* key.
        # A single-level index has a scalar label, so wrap it to index it
        # by level position like a multi-level label.
        labels = self._idx if isinstance(self._idx, tuple) else (self._idx,)
        if key is not None:
            try:
                return labels[list(self._obj.index.names).index(key)]
            except (ValueError, IndexError, TypeError):
                pass

        raise RowKeyError(
            "Could not find key '"
            + "' or '".join(str(candidate) for candidate in candidates)
            + "' in column or index names.")

    def __getitem__(self, key):
        """Item access; performs the same column/index lookup as attributes."""
        return self.__getattr__(key)

    def __setattr__(self, key, value):
        """Write ``value`` into column ``key`` of this row of the DataFrame.

        With an active column sub the key is expanded through it; otherwise,
        for multi-level columns, it is padded with ``''`` to the full depth.
        A missing column is created.
        """
        if self._with_column_sub is not None:
            key = self._sub_key(key)
        elif self._obj.columns.nlevels > 1:
            key = (key,) + ('',) * (self._obj.columns.nlevels - 1)

        frame = self._obj
        try:
            frame.at[self._idx, key] = value
        except (TypeError, ValueError):
            # Typically happens when value is iterable and the column is
            # missing or not of object dtype. Make sure an object-dtype
            # column exists and retry; a genuine error is raised again by
            # the second attempt.
            if key not in frame.columns:
                frame[key] = float('nan')
            frame[key] = frame[key].astype(object)
            frame.at[self._idx, key] = value

    def _update(self, dict_):
        """Convenience method, equivalent to looping over the items of
        ``dict_`` and setting each one as an attribute on this row.
        """
        for name, value in dict_.items():
            setattr(self, name, value)

    @contextmanager
    def column_sub(self, *args, **kwargs):
        """Temporarily map attribute names onto multi-level column labels.

        Example::

            row.usa = 21
            with row.column_sub('europe'):
                row.uk = 34         # sets row['europe', 'uk']
            with row.column_sub('[attr]', 'male'):
                row.mean_age = 21   # sets row['mean_age', 'male']

        This applies to both getting and setting. ``'[attr]'`` marks where
        the attribute name goes; if omitted it is appended as the last level.

        Before the sub becomes active, ``_pad_columns_to_level`` is called
        with the DataFrame, the number of levels required and the ``padding``
        keyword (default ``''``). NOTE(review): that helper is defined
        outside this class; presumably it pads existing column names "below"
        with ``padding`` when more levels are needed -- confirm there.

        You may want to do ``df.sort_index(axis=1, inplace=True)`` when you
        are finished.

        Raises
        ------
        ValueError
            if no sub label is given, if '[attr]' appears more than once, or
            if a keyword other than ``padding`` is passed.
        """
        n_attr = args.count('[attr]')
        if not args or n_attr > 1 or n_attr == len(args):
            raise ValueError("you must specify at most one '[attr]' spot, and "
                             "at least one sub label.")

        padding = kwargs.pop('padding', "")
        if kwargs:
            raise ValueError("unrecognised kwargs")

        # establish total levels required
        if n_attr == 0:
            args = args + ('[attr]',)
        _pad_columns_to_level(self._obj, len(args), padding)

        # Remember the enclosing sub so nested blocks restore it on exit
        # rather than switching substitution off entirely.
        previous = self._with_column_sub
        object.__setattr__(self, '_with_column_sub', tuple(args))
        try:
            yield
        finally:
            object.__setattr__(self, '_with_column_sub', previous)

    def __dir__(self):
        """List class attributes, instance attributes and column names."""
        names = set(dir(type(self)))
        names.update(self.__dict__)
        for column in self._obj.columns:
            # Multi-level columns are tuples; expose their top-level name.
            label = column[0] if isinstance(column, tuple) else column
            # dir() must only contain strings (it sorts the result).
            if isinstance(label, str):
                names.add(label)
        return list(names)

    def __repr__(self):
        """Show the underlying row, plus the active column sub if any."""
        text = repr(self._obj.loc[self._idx])
        if self._with_column_sub:
            text += "\nwith sub: " + str(self._with_column_sub)
        return text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment