Created
June 5, 2018 14:27
-
-
Save krosaen/64e2a3012fb4a9c65a6ccb8697e2779f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def summarize_data_shape(example):
    """
    Given some (json serializable) example, provide a concise summary of its
    structure by pruning it down to e.g. one item per list.

    Like https://github.com/krosaen/data-shapy/blob/master/data_shapy/data_shape.py
    but makes a best-effort summary of class objects and tuples too.

    Summary rules, applied in this order:

    * ``None``, ``bool``, ``int``, ``str`` and ``float`` are returned unchanged.
    * A numpy array becomes ``"np <dim>x<dim>..."`` built from its shape.
    * ``bytes`` become ``"bytes[<length>]"``.
    * A named tuple is summarized as the dict given by ``_asdict()``.
    * A list or tuple:
        - of length 2 whose first item is a ground value is treated as a
          (key, value) pair: ``[key, summary_of_value]``;
        - with fewer than 10 items, all ground values, is returned as is;
        - whose first 20 items are all dicts is summarized by the one dict
          (among those 20) with the most non-None values;
        - otherwise is summarized by its first item only.
    * A set is summarized as the list of its items.
    * A dict whose only key is ``'matrix'`` is summarized as a numpy array
      built from that value.
    * Any other dict is summarized value by value.
    * Any other object with an instance ``__dict__`` is summarized as a dict
      of its instance attributes plus a ``'cls'`` entry holding its class.

    Raises:
        ValueError: if ``example`` matches none of the cases above (for
            instance an object that has no ``__dict__``).
    """
    # Imported here so the function is usable even when the enclosing module
    # does not import numpy at the top level.
    import numpy as np

    def is_ground(item):
        # Scalar leaves that are reported verbatim.
        return item is None or isinstance(item, (bool, int, str, float))

    def non_none_kv_count(d):
        # Number of entries in d whose value is not None.
        return sum(1 for v in d.values() if v is not None)

    if is_ground(example):
        return example

    if isinstance(example, np.ndarray):
        return "np {}".format('x'.join(map(str, example.shape)))

    if isinstance(example, bytes):
        return "bytes[{}]".format(len(example))

    if isinstance(example, tuple) and hasattr(example, '_fields'):
        # Named tuple: summarize its field-name -> value mapping.
        return summarize_data_shape(example._asdict())

    if isinstance(example, (list, tuple)):
        if len(example) == 2 and is_ground(example[0]):
            # Looks like a (key, value) pair; keep the key, summarize the value.
            return [example[0], summarize_data_shape(example[1])]
        if len(example) < 10 and all(map(is_ground, example)):
            # Short flat sequence (this also covers the empty sequence).
            return example
        # The sequence is non-empty from here on: an empty one was returned above.
        head = example[:20]
        if all(isinstance(el, dict) for el in head):
            # A list of dicts: summarize the one with the most non-None
            # values, looking at most 20 items out. max() returns the first
            # maximal element, so ties resolve to the earliest dict.
            return [summarize_data_shape(max(head, key=non_none_kv_count))]
        return [summarize_data_shape(example[0])]

    if isinstance(example, set):
        return summarize_data_shape(list(example))

    if isinstance(example, dict):
        if len(example) == 1 and 'matrix' in example:
            return summarize_data_shape(np.array(example['matrix']))
        return {k: summarize_data_shape(v) for k, v in example.items()}

    if hasattr(example, '__dict__'):
        # Best-effort summary of an arbitrary object: its instance attributes,
        # skipping any name that is also defined on the class itself.
        class_attrs = example.__class__.__dict__
        result = {
            k: summarize_data_shape(getattr(example, k))
            for k in example.__dict__
            if k not in class_attrs
        }
        result['cls'] = example.__class__
        return result

    raise ValueError("dunno how to deal with type {}".format(type(example)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment