Last active
August 29, 2015 14:07
-
-
Save karimofthecrop/48c8c823d0ae57399467 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Reads JSON objects (one per line) from stdin
# and writes them as tab-separated values (TSV) to stdout.
# Redirect the output to a file or pipe it to your favorite shell utility.
import codecs | |
import json | |
import string | |
import sys | |
from optparse import OptionParser | |
def parse_line(line, columns):
    """Write one JSON object to stdout as a single tab-separated row.

    Works on both Python 2 and Python 3.

    Args:
        line: One line of input holding a JSON object, either as UTF-8
            encoded bytes or as already-decoded text. A single leading
            UTF-8 byte order mark (BOM) is ignored.
        columns: Keys to emit, in output order. A key that is missing from
            the object produces an empty cell.

    Raises:
        ValueError: If ``line`` is not valid UTF-8 or not valid JSON.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u"")

    if isinstance(line, bytes):
        line = line.decode("utf-8")
    # Remove exactly one leading BOM. ``strip(codecs.BOM_UTF8)`` would treat
    # the BOM as a *set* of bytes and strip any of them from both ends.
    if line.startswith(u"\ufeff"):
        line = line[1:]

    record = json.loads(line)

    cells = []
    for col in columns:
        try:
            cells.append(text_type(record[col]))
        except KeyError:
            cells.append(u"")

    row = u"\t".join(cells) + u"\n"
    if str is bytes:
        # Python 2: encode explicitly so that non-ASCII text does not depend
        # on the interpreter's implicit stdout encoding.
        row = row.encode("utf-8")
    sys.stdout.write(row)
def get_columns(line, want_columns):
    """Return the column names to emit for one input line.

    Args:
        line: One line of input holding a JSON object, as UTF-8 encoded
            bytes or as text. Only inspected when ``want_columns`` is None.
        want_columns: Explicit list of column names, or None to use every
            key of the JSON object in ``line``.

    Returns:
        ``want_columns`` unchanged when it is given; otherwise a sorted list
        of the keys of the JSON object in ``line``.

    Raises:
        ValueError: If the keys must be derived and ``line`` is not valid
            UTF-8 or not valid JSON.
    """
    if want_columns is not None:
        # The caller fixed the columns, so there is no need to parse the line.
        return want_columns

    if isinstance(line, bytes):
        line = line.decode("utf-8")
    # A leading byte order mark is not valid JSON; drop it before parsing.
    if line.startswith(u"\ufeff"):
        line = line[1:]

    # sorted() returns a list on both Python 2 and Python 3, whereas
    # ``keys()`` followed by ``.sort()`` only works on Python 2.
    return sorted(json.loads(line).keys())
def main():
    """Convert JSON objects read from stdin (one per line) to TSV on stdout.

    Command-line options:
        --cols=a,b,c  Comma-separated list of keys to output, in that order.
                      When omitted, the columns for each line are obtained
                      from get_columns() for that line.

    Blank input lines are skipped. Ctrl-C ends the program quietly.
    """
    parser = OptionParser()
    # option for comma-separated list of column headers/key names
    # ex. --cols=a,b,c
    parser.add_option("--cols", action="store", type="string", dest="columns")
    options, _ = parser.parse_args()

    want_columns = None
    if options.columns is not None:
        want_columns = options.columns.split(",")

    try:
        for line in sys.stdin:
            # An empty or whitespace-only line is not a JSON document; skip
            # it instead of aborting the whole run with a decode error.
            if not line.strip():
                continue
            columns = get_columns(line, want_columns)
            parse_line(line, columns)
    except KeyboardInterrupt:
        sys.exit()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment