Last active
September 13, 2015 14:52
-
-
Save pezon/659c0a987df1e89c00bf to your computer and use it in GitHub Desktop.
Proof of Concept: Convert JSON object to tabular data using ijson.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Proof of concept: json-to-table | |
Convert JSON object to tabular data using ijson, | |
which provides a SAX-like iterable for JSON objects | |
""" | |
from urlparse import urlparse | |
from collections import Counter, OrderedDict | |
import ijson | |
import requests | |
format_key = lambda key: key.split('.')[-1].lower() | |
ignore_events = ['map_key', 'start_map', 'end_map', 'start_array', 'end_array'] | |
def main(): | |
url = 'http://annacw.annapolis.gov/ArcGIS/rest/services/maps/Crimes/MapServer/0/query?f=json&where=(category%20like%20%27%25ASSAULT%25%27%20or%20category%20like%20%27%25BURGLARY%25%27%20or%20category%20like%20%27%25HOMICIDE%25%27%20or%20category%20like%20%27%25MOTOR%20VEHICLE%25%27%20or%20category%20like%20%27%25ROBBERY%25%27%20or%20category%20like%20%27%25THEFT%25%27)%20and%20REPORTED_ON%20between%20date%20%2708%2F13%2F15%27%20and%20date%20%279%2F12%2F2015%27&returnGeometry=true&spatialRel=esriSpatialRelIntersects&outFields=Category%2CCALL_CODE_1%2CINCIDENT_NUM%2CSTREETADDR%2CREPORTED_ON' | |
stream = StringIO(requests.get(url).text) | |
path, keys = guess_iterable(stream) | |
return create_table(stream, path=path, keys=keys) | |
def guess_iterable(stream): | |
keys = set() | |
depth = Counter() | |
stream.seek(0) | |
for prefix, event, value in ijson.parse(stream): | |
if event in ignore_events: | |
continue | |
d = len(prefix.split('.')) | |
depth[d] += 1 | |
keys.add(prefix) | |
iter_depth = depth.most_common()[1][0] if len(depth) > 1 else 0 | |
path = '.'.join(keys[0].split('.', iter_depth)[0:iter_depth]) | |
keys = [format_key(key) for key in keys if key.startswith(path)] | |
return path, keys | |
def create_table(stream, path, keys): | |
obj_list = [] | |
stream.seek(0) | |
for prefix, event, value in ijson.parse(stream): | |
if prefix == root_key: | |
if event == 'start_map': | |
obj = OrderedDict.fromkeys(keys) | |
elif event == 'end_map': | |
obj_list.append(obj) | |
elif event in ignore_events or prefix.startswith(path): | |
key = format_key(prefix) | |
obj[key] = value | |
return obj_list | |
def get_object_keys(stream, formatter=None, exclude=None): | |
if not formatter: | |
formatter = lambda key: key | |
if not exclude: | |
exclude = lambda key: False | |
keys = set() | |
stream.seek(0) | |
for prefix, event, value in ijson.parse(stream): | |
if event in ignore_events: | |
continue | |
keys.add(prefix) | |
return [formatter(key) for key in keys if not exclude(key)] | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment