Skip to content

Instantly share code, notes, and snippets.

@guyromm
Created August 15, 2023 14:27
Show Gist options
  • Save guyromm/d9b82ac84289e6683a93d4ee5bbe1959 to your computer and use it in GitHub Desktop.
Save guyromm/d9b82ac84289e6683a93d4ee5bbe1959 to your computer and use it in GitHub Desktop.
Analyze the keys and data types of JSONL records read from stdin.
#!/usr/bin/env python
import sys
import json
import collections
from typing import Any, Dict, List, Union
def get_type(value: Any) -> str:
    """Return the name of ``value``'s Python type (e.g. ``"int"``, ``"NoneType"``)."""
    return type(value).__name__
def _record_value(key_path: str, value: Any) -> None:
    """Count one scalar ``value`` seen at ``key_path`` by type and by value."""
    typed_path = f"{key_path}:{get_type(value)}"
    summary["key_types"][typed_path] += 1
    if typed_path not in samples:
        samples[typed_path] = collections.Counter()
    samples[typed_path][value] += 1


def update_summary(item: Union[Dict[str, Any], List[Any]], path: str = "") -> None:
    """Recursively tally key paths, value types and sample values of ``item``.

    The module-level ``summary`` and ``samples`` accumulators are updated in
    place:

    * ``summary["keys"]`` counts every scalar dict value under its dotted key
      path, and every list element under ``<path>.ARR``.
    * ``summary["key_types"]`` counts ``"<key path>:<type name>"`` strings.
    * ``samples`` maps each ``"<key path>:<type name>"`` string to a
      ``Counter`` of the scalar values seen there.

    Args:
        item: A decoded JSON value. Anything that is neither a dict nor a
            list is ignored.
        path: Dotted key path of ``item`` inside the record; empty at the
            root.
    """
    if isinstance(item, dict):
        for key, value in item.items():
            key_path = f"{path}.{key}" if path else key
            if isinstance(value, (dict, list)):
                update_summary(value, key_path)
            else:
                summary["keys"][key_path] += 1
                _record_value(key_path, value)
    elif isinstance(item, list):
        key_path = f"{path}.ARR" if path else "ARR"
        # Every element is counted here, whatever its kind.
        summary["keys"][key_path] += len(item)
        for value in item:
            if isinstance(value, (dict, list)):
                update_summary(value, key_path)
            else:
                # Scalar array elements get a type/sample record too, so
                # arrays of plain values are not missing from key_types.
                _record_value(key_path, value)
def order_by_commonality(counter):
    """Return a plain dict of ``counter``'s items ordered by ascending count.

    The sort is stable, so entries with equal counts keep their original
    relative order.
    """
    return dict(sorted(counter.items(), key=lambda pair: pair[1]))
# Module-level accumulators that update_summary() fills in place.
summary = {
    "key_types": collections.Counter(),
    "keys": collections.Counter(),
}
samples = {}


def main() -> None:
    """Read JSONL from stdin and print the collected key/type statistics.

    Output, in order: the per-path value samples as JSON, the key and
    key-type counts (least common first) as JSON, then one aligned line per
    ``key path:type`` giving its count, its number of distinct values and the
    first value seen with that value's count.
    """
    # Stream the input instead of loading every record into memory, and
    # skip blank lines (e.g. a trailing newline) that json.loads rejects.
    for line in sys.stdin:
        if not line.strip():
            continue
        update_summary(json.loads(line))

    ordered = {name: order_by_commonality(counter) for name, counter in summary.items()}
    print(json.dumps(samples, indent=4))
    print(json.dumps(ordered, indent=4))
    for typed_path, count in ordered["key_types"].items():
        seen = samples[typed_path]
        # A Counter keeps insertion order, so this is the first value seen.
        first_sample = next(iter(seen.items()))
        print(typed_path.ljust(70), str(count).rjust(10), str(len(seen)).rjust(8), first_sample)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment