Skip to content

Instantly share code, notes, and snippets.

@jnothman
Created September 5, 2014 05:16
Show Gist options
  • Save jnothman/7db6d077234a55714c23 to your computer and use it in GitHub Desktop.
Save jnothman/7db6d077234a55714c23 to your computer and use it in GitHub Desktop.
Extract and list json paths
#!/usr/bin/env python
"""
Faced with a collection of JSON blobs, this script lists what
paths (i.e. sequences of nested keys) exist in the data from
root to leaf.
For example:
$ echo '[{"a": {"a1": 124}, "b": 111}, {"a": {"a2": 111}, "c": null}]' \
| list-json-paths.py
will output:
[] "a" "a1"
[] "a"
[] "b"
[]
[] "a" "a2"
[] "a"
[] "c"
[]
in which [] indicates a list element. The root value itself is also
reported, as an empty path (a blank line). Passing this output to:
$ awk '{counts[$0] += 1} END {for (k in counts) {print counts[k] "\t" k}}'
yields aggregate frequencies of each path:
1
1 [] "b"
1 [] "c"
2 []
1 [] "a" "a1"
1 [] "a" "a2"
2 [] "a"
"""
from __future__ import print_function
import sys
import re
from json import JSONDecoder, dumps
def _non_ws_index(s, start=0, WS=re.compile(r'\s*')):
return WS.match(s, start).end()
def read_multi_json(s):
    """Yield each JSON value found in ``s``, in order of appearance.

    ``s`` is either a string or an object with a ``read()`` method, in
    which case its whole content is read first. Consecutive JSON values
    may be separated by any amount of whitespace (including none).

    Technique from http://stackoverflow.com/questions/8730119
    """
    decoder = JSONDecoder()
    text = s.read() if hasattr(s, 'read') else s
    total = len(text)

    # raw_decode does not skip leading whitespace, so always position the
    # cursor on the first character of the next value before decoding.
    pos = _non_ws_index(text)
    while pos != total:
        value, pos = decoder.raw_decode(text, idx=pos)
        yield value
        pos = _non_ws_index(text, pos)
def extract_paths(obj):
    """Yield the path to every node of the decoded JSON value ``obj``.

    A path is a tuple of steps leading from ``obj`` to a node: the
    literal string ``'[]'`` for descending into a list element, or the
    JSON-encoded key (as produced by ``dumps``) for descending into a
    dict value. Paths of descendants are yielded first; the empty tuple
    ``()``, which stands for ``obj`` itself, is always yielded last.
    """
    if isinstance(obj, list):
        for el in obj:
            for path in extract_paths(el):
                yield ('[]',) + path
    elif isinstance(obj, dict):
        # items() instead of the Python 2-only iteritems(), so this works
        # on both Python 2 and Python 3.
        for k, v in obj.items():
            k = dumps(k)
            for path in extract_paths(v):
                yield (k,) + path
    # Every node, leaf or container, finishes with its own empty path.
    yield ()
def main(f):
    """Print every path of every JSON value read from ``f``, one per line.

    The steps of a path are separated by tab characters; the empty path
    is printed as an empty line.
    """
    for document in read_multi_json(f):
        for steps in extract_paths(document):
            # Every step is already a string, so joining is equivalent to
            # printing the steps with a tab separator.
            print('\t'.join(steps))
# Script entry point: process the JSON values arriving on standard input.
if __name__ == '__main__':
    main(sys.stdin)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment