Last active
May 9, 2017 12:21
-
-
Save BibMartin/b0219727266515fa2af059df7f75b967 to your computer and use it in GitHub Desktop.
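Monkey-patch pandas.DataFrame so that flattened ("unstacked") JSON columns such as "counties._0.name" can be navigated as nested attributes (df.counties._0.name).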
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import DataFrame, MultiIndex | |
def __getattribute__(self, x):
    """Resolve attribute *x*, falling back to dotted column prefixes.

    Normal attribute lookup is tried first.  If it fails and *x* is the
    first dot-separated component of at least one string column label:

    * when columns named ``x.<rest>`` exist, a new DataFrame holding those
      columns is returned, with the ``x.`` prefix stripped from the labels;
    * otherwise the plain column ``self[x]`` is returned.

    Raises:
        AttributeError: if *x* is neither a regular attribute nor a column
            prefix.  ``AttributeError`` (rather than ``ValueError``) is what
            ``hasattr``, ``getattr(obj, name, default)`` and the class's own
            ``__getattr__`` fallback rely on.
    """
    try:
        return _parent__getattribute__(self, x)
    except AttributeError:
        # Bookkeeping names pandas declares as internal must not trigger the
        # column lookup below: reading ``columns`` may itself need them, and
        # on a half-initialised frame that would recurse without end.
        if x in getattr(type(self), '_internal_names_set', ()):
            raise
        columns = _parent__getattribute__(self, 'columns')
        # Only string labels can carry a dotted prefix; other labels
        # (ints, tuples, ...) are ignored instead of crashing on ``.split``.
        labels = [c for c in columns if isinstance(c, str)]
        cols = sorted({c.split('.')[0] for c in labels})
        if x not in cols:
            raise AttributeError('{} not in {}'.format(x, cols))
        _prefix = x + '.'
        nested = [c for c in labels if c.startswith(_prefix)]
        if not nested:
            # No ``x.<rest>`` columns: *x* is an ordinary column.
            return self[x]
        # ``rename`` is the supported way to relabel with a callable;
        # ``rename_axis`` no longer accepts a function mapper.
        return self[nested].rename(columns=lambda c: c[len(_prefix):])


# Keep a handle on the original lookup before installing the hook above.
_parent__getattribute__ = DataFrame.__getattribute__
DataFrame.__getattribute__ = __getattribute__
def __dir__(self):
    """List attribute names, including the dotted-column prefixes.

    Returns the parent ``__dir__`` listing followed by the first
    dot-separated component of every string column label that the parent
    listing does not already contain.  Non-string labels are skipped
    because they have no ``split`` method and cannot be attribute names.
    """
    prefixes = {c.split('.')[0] for c in self.columns if isinstance(c, str)}
    known = list(_parent__dir__(self))
    already_listed = set(known)
    # Append only new names so the listing carries no duplicates.
    return known + sorted(p for p in prefixes if p not in already_listed)


# Keep a handle on the original implementation before replacing it.
_parent__dir__ = DataFrame.__dir__
DataFrame.__dir__ = __dir__
def unstack(x, prefix=""):
    """Flatten nested dicts/lists into a single-level dict with dotted keys.

    Args:
        x: The value to flatten.  Dicts are flattened recursively; lists are
            treated as dicts keyed by ``'_<position>'``; anything else is
            returned unchanged.
        prefix: String prepended to every key of the returned dict.

    Returns:
        A flat dict when *x* is a dict or list, otherwise *x* itself.
        Empty nested dicts/lists contribute no keys.
    """
    if isinstance(x, list):
        # A list behaves like a dict whose keys are '_0', '_1', ...
        x = {'_' + str(pos): item for pos, item in enumerate(x)}
    if not isinstance(x, dict):
        return x

    flat = {}
    for name, value in x.items():
        # The child call already prefixes its keys with '<name>.'.
        nested = unstack(value, prefix=name + '.')
        if not isinstance(nested, dict):
            flat[prefix + name] = value
            continue
        flat.update((prefix + path, leaf) for path, leaf in nested.items())
    return flat
def unwind(self):
    """Return a copy whose dotted column labels become a column MultiIndex.

    Each label is split once on its first ``'.'``: ``'_0.name'`` becomes
    ``('_0', 'name')``.  The index and the data are carried over unchanged,
    and the original frame is not modified.
    """
    columns = MultiIndex.from_tuples(
        [tuple(col.split('.', 1)) for col in self.columns])
    # Relabel a copy rather than rebuilding from ``self.values``: going
    # through one NumPy array would coerce mixed-type columns (e.g. names
    # and populations) to ``object`` dtype.
    out = self.copy()
    out.columns = columns
    return out


DataFrame.unwind = unwind
# Example
#########
# Two nested records: a scalar field, a nested dict and a list of dicts.
data = [
    {
        'state': 'Florida',
        'shortname': 'FL',
        'info': {'governor': 'Rick Scott'},
        'counties': [
            {'name': 'Dade', 'population': 12345},
            {'name': 'Broward', 'population': 40000},
            {'name': 'Palm Beach', 'population': 60000},
        ],
    },
    {
        'state': 'Ohio',
        'shortname': 'OH',
        'info': {'governor': 'John Kasich'},
        'counties': [
            {'name': 'Summit', 'population': 1234},
            {'name': 'Cuyahoga', 'population': 1337},
        ],
    },
]

# Flatten every record into dotted keys, then build one row per record.
df = DataFrame(list(map(unstack, data)))

# Attribute access walks the dotted prefixes: 'counties' -> '_0'.
print(df.counties._0)

# Split the remaining labels into two column levels and stack the outer one.
print(df.counties.unwind().stack(0))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment