Created
August 15, 2023 14:27
-
-
Save guyromm/d9b82ac84289e6683a93d4ee5bbe1959 to your computer and use it in GitHub Desktop.
Analyze the keys and data types of JSONL records read from stdin.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import collections
import json
import operator
import sys
from typing import Any, Dict, List, Union
def get_type(value: Any) -> str:
    """Return the name of ``value``'s runtime type.

    For example ``"str"``, ``"int"``, ``"bool"`` or ``"NoneType"``.
    """
    return type(value).__name__
def update_summary(item: Union[Dict[str, Any], List[Any]], path: str = "") -> None:
    """Walk one decoded JSON value and tally its key paths into the module globals.

    Mutates ``summary["keys"]``, ``summary["key_types"]`` and ``samples`` in
    place; nothing is returned.

    Args:
        item: A dict or list to walk. Any other value is ignored.
        path: Dotted key path of ``item`` inside its record; empty at the
            top level. List levels contribute the segment ``ARR``.
    """
    if isinstance(item, dict):
        for key, value in item.items():
            key_path = f"{path}.{key}" if path else key
            if isinstance(value, (dict, list)):
                # Containers are not tallied themselves; only what they hold.
                update_summary(value, key_path)
            else:
                summary["keys"][key_path] += 1
                kp = f"{key_path}:{get_type(value)}"
                summary["key_types"][kp] += 1
                if kp not in samples:
                    samples[kp] = collections.Counter()
                samples[kp][value] += 1
    elif isinstance(item, list):
        key_path = f"{path}.ARR" if path else "ARR"
        # Every element is counted under the ARR path ...
        summary["keys"][key_path] += len(item)
        for value in item:
            # ... but only nested containers are descended into.
            # NOTE(review): scalar elements get no key_types/samples entry.
            if isinstance(value, (dict, list)):
                update_summary(value, key_path)
def order_by_commonality(counter):
    """Return the entries of ``counter`` as a dict ordered by ascending count.

    The least common entry comes first. ``sorted`` is stable, so entries with
    equal counts keep the order in which ``counter`` yields them.
    """
    return dict(sorted(counter.items(), key=operator.itemgetter(1)))
# Module-level accumulators that update_summary() mutates as records are read.
summary = {
    "key_types": collections.Counter(),  # "path:type" -> tally
    "keys": collections.Counter(),  # "path" -> tally
}
# "path:type" -> Counter of the scalar values seen there.
samples = {}


def main() -> None:
    """Summarise the JSONL stream on stdin and print the report to stdout.

    Prints the per-key value samples as JSON, then the key and key-type
    tallies as JSON, then one aligned text row per ``path:type`` entry.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    # Stream the input rather than holding every record in memory, and skip
    # blank lines, which json.loads() would otherwise reject.
    for line in sys.stdin:
        if line.strip():
            update_summary(json.loads(line))

    ordered = {name: order_by_commonality(counts) for name, counts in summary.items()}
    print(json.dumps(samples, indent=4))
    print(json.dumps(ordered, indent=4))

    # Row layout: key:type, occurrences, distinct values, first value seen
    # together with its count.
    for key_type, count in ordered["key_types"].items():
        seen = samples[key_type]
        # next(iter(...)) reads the first entry without copying all of them.
        first_sample = next(iter(seen.items()))
        print(key_type.ljust(70), str(count).rjust(10), str(len(seen)).rjust(8), first_sample)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment