Created
September 5, 2014 05:16
-
-
Save jnothman/7db6d077234a55714c23 to your computer and use it in GitHub Desktop.
Extract and list json paths
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Faced with a collection of JSON blobs, this script lists what | |
paths (i.e. sequences of nested keys) exist in the data from | |
root to leaf. | |
For example: | |
$ echo '[{"a": {"a1": 124}, "b": 111}, {"a": {"a2": 111}, "c": null}]' \ | |
| list-json-paths.py | |
will output: | |
[] "a" "a1" | |
[] "a" | |
[] "b" | |
[] | |
[] "a" "a2" | |
[] "a" | |
[] "c" | |
[] | |
in which [] indicates a list element. Passing this output to: | |
$ awk '{counts[$0] += 1} END {for (k in counts) {print counts[k] "\t" k}}' | |
yields aggregate frequencies of each path: | |
1 | |
1 [] "b" | |
1 [] "c" | |
2 [] | |
1 [] "a" "a1" | |
1 [] "a" "a2" | |
2 [] "a" | |
""" | |
from __future__ import print_function | |
import sys | |
import re | |
from json import JSONDecoder, dumps | |
def _non_ws_index(s, start=0, WS=re.compile(r'\s*')): | |
return WS.match(s, start).end() | |
def read_multi_json(s):
    """Yield each JSON value found in a stream of concatenated JSON documents.

    ``s`` is either a string or an object with a ``read()`` method (in
    which case its whole content is read up front). Documents may be
    separated by arbitrary whitespace, or by nothing at all.

    Errors raised by the JSON decoder on malformed input propagate to
    the caller.
    """
    # thanks to http://stackoverflow.com/questions/8730119
    decode = JSONDecoder().raw_decode
    if hasattr(s, 'read'):
        s = s.read()
    end = _non_ws_index(s)
    while end != len(s):
        # decoder doesn't expect leading whitespace, so ``end`` always
        # points at the first character of the next document
        obj, end = decode(s, idx=end)
        yield obj
        end = _non_ws_index(s, end)
def extract_paths(obj):
    """Yield every key path from ``obj`` down to each nested value.

    A path is a tuple of components: ``'[]'`` stands for "any element of
    a list", and a dict key appears JSON-encoded (e.g. ``'"name"'``).
    Paths of nested values are yielded before the path of their
    container, and the empty tuple ``()`` (the path to ``obj`` itself)
    is always yielded last.
    """
    if isinstance(obj, list):
        for el in obj:
            for path in extract_paths(el):
                yield ('[]',) + path
    elif isinstance(obj, dict):
        # items() rather than iteritems(): the latter does not exist on
        # Python 3, while items() works on both major versions.
        for k, v in obj.items():
            k = dumps(k)
            for path in extract_paths(v):
                yield (k,) + path
    # Every value, container or leaf, contributes its own (empty) path.
    yield ()
def main(f):
    """Print every key path of every JSON document read from ``f``.

    ``f`` is passed to ``read_multi_json``. Each path is written to
    standard output on its own line with its components separated by
    tabs; the empty (root) path is printed as an empty line.
    """
    for obj in read_multi_json(f):
        for path in extract_paths(obj):
            print(*path, sep='\t')
# Script entry point: list the paths of the JSON documents given on stdin.
if __name__ == '__main__':
    main(sys.stdin)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment