jnothman · September 5, 2014 05:16
diff --git a/list-json-paths.py b/list-json-paths.py
 #!/usr/bin/env python
 """
 Faced with a collection of JSON blobs, this script lists what
 paths (i.e. sequences of nested keys) exist in the data from
 root to leaf.

 For example:
  $ echo '[{"a": {"a1": 124}, "b": 111}, {"a": {"a2": 111}, "c": null}]' \
      | list-json-paths.py
 will output:
  []      "a"     "a1"
  []      "a"
  []      "b"
  []
  []      "a"     "a2"
  []      "a"
  []      "c"
  []
 in which [] indicates a list element. An empty line (not shown above) is
 also printed for the root of each blob. Passing this output to:
  $ awk '{counts[$0] += 1} END {for (k in counts) {print counts[k] "\t" k}}'
 yields aggregate frequencies of each path:
  1
  1       []      "b"
  1       []      "c"
  2       []
  1       []      "a"     "a1"
  1       []      "a"     "a2"
  2       []      "a" 
 """

 from __future__ import print_function
 import sys
 import re
 from json import JSONDecoder, dumps


 def _non_ws_index(s, start=0, WS=re.compile(r'\s*')):
    return WS.match(s, start).end()


 def read_multi_json(s):
     """Yield each JSON value found in ``s``, one after another.

     ``s`` holds zero or more JSON values back to back, optionally
     separated by whitespace. It may be a string, or any object with a
     ``read()`` method, in which case the result of ``s.read()`` is parsed.
     Errors raised by the decoder for malformed input propagate to the
     caller.
     """
     # thanks to http://stackoverflow.com/questions/8730119
     text = s.read() if hasattr(s, 'read') else s
     raw_decode = JSONDecoder().raw_decode
     total = len(text)
     # raw_decode doesn't accept leading whitespace, so skip it up front
     # and again after every decoded value.
     pos = _non_ws_index(text)
     while pos != total:
         value, pos = raw_decode(text, idx=pos)
         yield value
         pos = _non_ws_index(text, pos)


 def extract_paths(obj):
     """Yield every key path present in a decoded JSON value.

     Each path is a tuple of strings running from the root towards a leaf:
     ``'[]'`` stands for "any element of a list" and a dict key appears in
     its JSON-encoded form (e.g. ``'"a"'``). Paths of nested values are
     yielded before the path of their container, and the empty tuple --
     the path of ``obj`` itself -- is always yielded last.
     """
     if isinstance(obj, list):
         for el in obj:
             for path in extract_paths(el):
                 yield ('[]',) + path
     elif isinstance(obj, dict):
         # items() exists on both Python 2 and 3; iteritems() is Python 2
         # only and raises AttributeError on Python 3.
         for k, v in obj.items():
             # JSON-encode the key so it stays quoted and unambiguous.
             k = dumps(k)
             for path in extract_paths(v):
                 yield (k,) + path
     # Every value, scalar or container, contributes its own (empty) path.
     yield ()


 def main(f):
     """Print one tab-separated line per key path found in ``f``.

     ``f`` is passed to ``read_multi_json``; every path that
     ``extract_paths`` yields for each decoded blob is printed on its own
     line, so the empty root path shows up as an empty line.
     """
     documents = read_multi_json(f)
     for document in documents:
         for key_path in extract_paths(document):
             print(*key_path, sep='\t')


 if __name__ == '__main__':
     # Executed as a script: process the JSON arriving on standard input.
     stdin_stream = sys.stdin
     main(stdin_stream)
	#!/usr/bin/env python
	"""
	Faced with a collection of JSON blobs, this script lists what
	paths (i.e. sequences of nested keys) exist in the data from
	root to leaf.

	For example:
	$ echo '[{"a": {"a1": 124}, "b": 111}, {"a": {"a2": 111}, "c": null}]' \
	| list-json-paths.py
	will output:
	[] "a" "a1"
	[] "a"
	[] "b"
	[]
	[] "a" "a2"
	[] "a"
	[] "c"
	[]
	in which [] indicates a list element. Passing this output to:
	$ awk '{counts[$0] += 1} END {for (k in counts) {print counts[k] "\t" k}}'
	yields aggregate frequencies of each path:
	1
	1 [] "b"
	1 [] "c"
	2 []
	1 [] "a" "a1"
	1 [] "a" "a2"
	2 [] "a"
	"""

	from __future__ import print_function
	import sys
	import re
	from json import JSONDecoder, dumps


	def _non_ws_index(s, start=0, WS=re.compile(r'\s*')):
	return WS.match(s, start).end()


	def read_multi_json(s):
	    """Yield each JSON value found in ``s``, one after another.

	    ``s`` holds zero or more JSON values back to back, optionally
	    separated by whitespace. It may be a string, or any object with a
	    ``read()`` method, in which case the result of ``s.read()`` is parsed.
	    Errors raised by the decoder for malformed input propagate to the
	    caller.
	    """
	    # thanks to http://stackoverflow.com/questions/8730119
	    decode = JSONDecoder().raw_decode
	    if hasattr(s, 'read'):
	        s = s.read()
	    end = _non_ws_index(s)
	    while end != len(s):
	        # decoder doesn't expect leading whitespace
	        obj, end = decode(s, idx=end)
	        yield obj
	        end = _non_ws_index(s, end)


	def extract_paths(obj):
	    """Yield every key path present in a decoded JSON value.

	    Each path is a tuple of strings running from the root towards a leaf:
	    ``'[]'`` stands for "any element of a list" and a dict key appears in
	    its JSON-encoded form (e.g. ``'"a"'``). Paths of nested values are
	    yielded before the path of their container, and the empty tuple --
	    the path of ``obj`` itself -- is always yielded last.
	    """
	    if isinstance(obj, list):
	        for el in obj:
	            for path in extract_paths(el):
	                yield ('[]',) + path
	    elif isinstance(obj, dict):
	        # items() exists on both Python 2 and 3; iteritems() is Python 2
	        # only and raises AttributeError on Python 3.
	        for k, v in obj.items():
	            # JSON-encode the key so it stays quoted and unambiguous.
	            k = dumps(k)
	            for path in extract_paths(v):
	                yield (k,) + path
	    # Every value, scalar or container, contributes its own (empty) path.
	    yield ()


	def main(f):
	    """Print one tab-separated line per key path found in ``f``.

	    ``f`` is passed to ``read_multi_json``; every path that
	    ``extract_paths`` yields for each decoded blob is printed on its own
	    line, so the empty root path shows up as an empty line.
	    """
	    for obj in read_multi_json(f):
	        for path in extract_paths(obj):
	            print(*path, sep='\t')


	if __name__ == '__main__':
	    # Executed as a script: process the JSON arriving on standard input.
	    main(sys.stdin)