Created
January 27, 2016 11:32
-
-
Save FedericoPonzi/e188a2315120a6ece2a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import io
import sys

import simplejson as json
def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names.

    Args:
        json_file_path: Path of the input file, holding one JSON object per line.
        csv_file_path: Path of the csv file to create (truncated if it exists).
        column_names: Iterable of flattened, dot-separated column names, such
            as the result of `get_superset_of_column_names_from_file`.
    """
    # Freeze the column order once so the header and every data row are
    # guaranteed to line up, whatever kind of iterable was passed in.
    columns = list(column_names)

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] == 2:
        fout = open(csv_file_path, 'wb')
    else:
        fout = io.open(csv_file_path, 'w', newline='', encoding='utf-8')

    with fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with io.open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                if not line.strip():
                    # A blank line is not a JSON document; skip it rather
                    # than let json.loads raise.
                    continue
                csv_file.writerow(get_row(json.loads(line), columns))
def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names.

    Args:
        json_file_path: Path of the input file, holding one JSON object per line.

    Returns:
        A set containing every flattened (dot-separated) key that occurs in
        at least one line of the file.
    """
    column_names = set()
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with io.open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # A blank line is not a JSON document; skip it rather than
                # let json.loads raise.
                continue
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names
try:
    from collections.abc import MutableMapping as _MutableMapping
except ImportError:  # Python 2 keeps the ABCs directly in `collections`.
    from collections import MutableMapping as _MutableMapping


def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their leaf values.

    Nested mappings are flattened by joining the keys along the path with
    '.'; every value that is not itself a mapping becomes one entry.

    Example:

        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }

    will return: {'a.b': 2, 'a.c': 3}

    The keys of the result will be the column names for the eventual csv file.

    Args:
        line_contents: The (possibly nested) mapping to flatten.
        parent_key: Flattened key of the enclosing mapping; used as the prefix
            for every key produced. Empty at the top level.
    """
    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, _MutableMapping):
            # Recurse; an empty nested mapping contributes no columns.
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:

        d = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        key = 'a.b'

    will return: 2

    Args:
        d: The (possibly nested) dictionary to look in.
        key: Dot-separated path of keys to follow, one nesting level per part.

    Returns:
        The value found at the end of the path, or None when any part of the
        path is missing or an intermediate value cannot be indexed by key
        (for example None, a string or a list).
    """
    current = d
    for part in key.split('.'):
        try:
            current = current[part]
        except (KeyError, TypeError):
            # KeyError: this part is absent at this level.
            # TypeError: the value reached is not a mapping, so the path
            # cannot continue; report it the same way as a missing key.
            return None
    return current
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: One decoded JSON object (a possibly nested dict).
        column_names: Iterable of flattened, dot-separated column names; the
            row holds one cell per name, in iteration order.

    Returns:
        A list of strings. A column whose value is missing or None yields an
        empty string; every other value is rendered with str.format.
    """
    is_py2 = sys.version_info[0] == 2
    row = []
    for column_name in column_names:
        value = get_nested_value(line_contents, column_name)
        if value is None:
            row.append('')
        elif is_py2 and isinstance(value, unicode):  # noqa: F821
            # The Python 2 csv writer only handles byte strings, so encode
            # text explicitly. `unicode` is never evaluated on Python 3
            # because `is_py2` short-circuits the test.
            row.append(value.encode('utf-8'))
        else:
            row.append('{0}'.format(value))
    return row
def derive_csv_path(json_file):
    """Return the csv output path that corresponds to `json_file`.

    A trailing '.json' extension is replaced by '.csv'; any other name simply
    gets '.csv' appended. Only the end of the path is inspected, so a '.json'
    occurring earlier in the path (e.g. in a directory name) is left intact.
    """
    suffix = '.json'
    if json_file.endswith(suffix):
        json_file = json_file[:-len(suffix)]
    return '{0}.csv'.format(json_file)


if __name__ == '__main__':
    # Convert a yelp dataset file from json to csv.
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )

    parser.add_argument(
        'json_file',
        type=str,
        help='The json file to convert.',
    )

    args = parser.parse_args()

    json_file = args.json_file
    csv_file = derive_csv_path(json_file)

    # Sort the columns so the csv layout is the same on every run instead of
    # depending on set iteration order.
    column_names = sorted(get_superset_of_column_names_from_file(json_file))
    read_and_write_file(json_file, csv_file, column_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment