Skip to content

Instantly share code, notes, and snippets.

@d1manson
Last active February 15, 2016 13:05
Show Gist options
  • Save d1manson/7d7b60fe508ba3e748a4 to your computer and use it in GitHub Desktop.
Save d1manson/7d7b60fe508ba3e748a4 to your computer and use it in GitHub Desktop.
pandas Row_Indexer - syntactic sugar (with large overhead) for assigning values to columns within a single row of a DataFrame
class RowIndexer(object):
    """Attribute-style access to the columns of one row of a DataFrame.

    This class is syntactic sugar for performing a slow iteration over
    rows in a pandas DataFrame, where each row needs to have some (slow)
    computation performed and the results assigned to multiple columns
    in the row. Here is a bare-bones example::

        df = ...  # create a DataFrame
        theRow = RowIndexer(df, 3)  # here we explicitly choose row 3
        theRow.something = "hello"
        theRow.other_thing = 99.2
        # df.something[3]: "hello"
        # df.other_thing[3]: 99.2

    The intended use case is within a generator, which yields a new
    RowIndexer each time.

    If the dataframe's index has names, you can access those by name on
    the indexer (e.g. if ``'day'`` is one of the levels, then ``theRow.day``
    will return the value of that level of the index).

    Internal state lives in ``_obj`` (the DataFrame), ``_idx`` (the row
    label) and ``_with_column_sub`` (the active ``column_sub`` template, or
    ``None``); these are written with ``object.__setattr__`` so that they
    bypass the column-assigning ``__setattr__`` defined below.
    """

    def __init__(self, obj, idx, **kwargs):
        """Bind the indexer to row ``idx`` of DataFrame ``obj``.

        Any extra keyword arguments are stored as plain instance attributes
        (they are not written to the DataFrame) and can be read back by
        name, e.g. ``RowIndexer(df, 3, tag='x').tag``.
        """
        object.__setattr__(self, '_obj', obj)
        object.__setattr__(self, '_idx', idx)
        object.__setattr__(self, '_with_column_sub', None)
        # ``items`` (rather than the Python 2 only ``iteritems``) works on
        # both Python 2 and Python 3.
        for name, value in kwargs.items():
            object.__setattr__(self, name, value)

    def __getattr__(self, key):
        """Return the value stored under ``key`` for this row.

        Only called when normal attribute lookup fails. Candidates are
        tried in this order: the bare key, the key substituted into the
        active ``column_sub`` template, and the key padded with ``''`` to
        the depth of multi-level columns. Each candidate is looked up first
        as a column and then as an index level name.

        Raises ``KeyError`` if no candidate matches, and ``AttributeError``
        for the internal state names and for dunder names.
        """
        # Internal/dunder names must never be resolved through the frame:
        # on an instance whose ``__init__`` has not run (copy/pickle create
        # such instances) reading ``self._obj`` here would recurse forever,
        # and protocol probes like ``__setstate__`` expect AttributeError.
        if isinstance(key, str) and (
                key in ('_obj', '_idx', '_with_column_sub')
                or (key.startswith('__') and key.endswith('__'))):
            raise AttributeError(key)

        possible_keys = [key]
        template = self._with_column_sub
        if template is not None:
            slot = template.index('[attr]')
            possible_keys.append(
                template[:slot] + (key,) + template[slot + 1:])
        ncol_levels = self._obj.columns.nlevels
        if ncol_levels > 1 and not isinstance(key, tuple):
            possible_keys.append((key,) + ('',) * (ncol_levels - 1))

        index = self._obj.index
        index_names = list(index.names)
        for candidate in possible_keys:
            # Column lookup is deliberately best-effort: pandas raises a
            # variety of exception types for unusable keys, and any failure
            # just means "try the next option".
            try:
                return self._obj.at[self._idx, candidate]
            except Exception:
                pass
            try:
                level = index_names.index(candidate)
                # With a single-level index the row label *is* the value;
                # subscripting it would fail or slice a string label.
                if index.nlevels > 1:
                    return self._idx[level]
                return self._idx
            except (ValueError, TypeError, IndexError):
                pass

        # ``str`` each candidate: some of them are tuples, which
        # ``str.join`` would reject.
        raise KeyError(
            "Could not find key '"
            + "' or '".join(str(k) for k in possible_keys)
            + "' in column or index names.")

    def __getitem__(self, key):
        """Item access; resolves ``key`` exactly like attribute access."""
        return self.__getattr__(key)

    def __setattr__(self, key, value):
        """Write ``value`` into column ``key`` of this row.

        Inside a ``column_sub`` block the key is substituted into the
        active template; otherwise, for multi-level columns, it is padded
        with ``''`` to the full depth.
        """
        template = self._with_column_sub
        if template is not None:
            slot = template.index('[attr]')
            key = template[:slot] + (key,) + template[slot + 1:]
        elif self._obj.columns.nlevels > 1:
            key = (key,) + ('',) * (self._obj.columns.nlevels - 1)

        try:
            self._obj.at[self._idx, key] = value
        except (TypeError, ValueError):
            # This seems to occur when value is iterable (or otherwise does
            # not fit the column's dtype). To overcome this, make sure the
            # column exists with object dtype and try again.
            if key not in self._obj.columns:
                self._obj[key] = None
            self._obj[key] = self._obj[key].astype(object)
            self._obj.at[self._idx, key] = value

    def _update(self, dict_):
        """Convenience method, equivalent to looping over the items in
        ``dict_`` and setting them as attributes on this row.
        """
        for name, value in dict_.items():
            self.__setattr__(name, value)

    @contextmanager
    def column_sub(self, *args, **kwargs):
        """Temporarily address a sub-level of multi-level columns.

        Example::

            row.usa = 21
            with row.column_sub('europe'):
                row.uk = 34  # sets row['europe', 'uk']
            with row.column_sub('[attr]', 'male'):
                row.mean_age = 21  # sets row['mean_age', 'male']

        Applies to both getting and setting. ``'[attr]'`` marks where the
        attribute name goes; if it is omitted it is appended as the last
        level. On leaving the block the previously active template (if
        any) is restored, so blocks may be nested.

        If the required number of column levels is greater than that
        already in use, the existing column names will be padded as
        specified by kwarg ``padding=''``, with the padding added "below"
        existing names; this matches what reset index seems to do.

        You may want to do ``df.sort_index(axis=1, inplace=True)`` when you
        are finished.

        Raises ``ValueError`` if there is no sub label, more than one
        ``'[attr]'`` slot, or an unknown keyword argument.
        """
        attr_slots = args.count('[attr]')
        if not args or attr_slots > 1 or attr_slots == len(args):
            raise ValueError("you must specify at most one '[attr]' spot, and "
                             "at least one sub label.")
        # ``padding`` is the only keyword accepted; it may be any label,
        # not just a single character.
        padding = kwargs.pop('padding', "")
        if kwargs:
            raise ValueError("unrecognised kwargs")

        # establish total levels required
        if '[attr]' not in args:
            args = args + ('[attr]',)
        _pad_columns_to_level(self._obj, len(args), padding)

        previous = self._with_column_sub
        object.__setattr__(self, '_with_column_sub', tuple(args))
        try:
            yield
        finally:
            object.__setattr__(self, '_with_column_sub', previous)

    def __dir__(self):
        """List instance attributes, class attributes and column names.

        Only string names are reported (for tuple column labels, the first
        element), because ``dir()`` sorts the result and cannot order mixed
        types.
        """
        names = set(self.__dict__)
        names.update(dir(type(self)))
        for column in self._obj.columns:
            label = column[0] if isinstance(column, tuple) and column else column
            if isinstance(label, str):
                names.add(label)
        return list(names)

    def __repr__(self):
        """Show the row's contents, plus the active ``column_sub`` if any."""
        text = repr(self._obj.loc[self._idx])
        if self._with_column_sub:
            text += "\nwith sub: " + str(self._with_column_sub)
        return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment