Created
May 17, 2012 14:24
-
-
Save brendano/2719239 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
r"""
vertunion File1 File2 ....

Iterates through parallel files, each row
  "DocID \t JSON1"   "DocID \t JSON2"  ....
and outputs
  "DocID \t UnionOfJSONs"
Union of key-value pairs, that is.
Vaguely reminiscent of vertical concatenation.

You can append key prefixes with a comma:
  vertunion File1,a File2,b
then "a" and "b" will be used as prefixes for the keys in the output
(key "k" from File1 is written as "[a]k").

A filename of "-" reads from standard input.

TODO make work for output of 'hashjoin', looking like
  DocID \t JSON1 \t JSON2 \t JSON3 ...
So a flattening operation across the board.
"""
import json
import sys


def parse_filespec(filespec):
    """Split a "filename[,prefix]" argument into (filename, prefix).

    The filename is the text before the first comma and the prefix is the
    text after the last comma; prefix is None when there is no comma.
    """
    parts = filespec.split(',')
    prefix = parts[-1] if len(parts) > 1 else None
    return parts[0], prefix


def merge_records(records, prefixes):
    """Return the union of the key-value pairs of several JSON objects.

    records  -- list of dicts, one per input file.
    prefixes -- list parallel to records; a truthy prefix p renames each
                key k of the matching record to "[p]k".

    Raises ValueError if a record is not a dict or if two records produce
    the same output key.
    """
    out_rec = {}
    for record, prefix in zip(records, prefixes):
        if not isinstance(record, dict):
            raise ValueError("expected a JSON object, got %r" % (record,))
        # .items() works on both Python 2 and Python 3 (iteritems is 2-only).
        for key, value in record.items():
            if prefix:
                key = '[%s]%s' % (prefix, key)
            if key in out_rec:
                raise ValueError(
                    "repeated key {k} ... need to turn on prefixing, "
                    "or else munge the keys".format(k=repr(key)))
            out_rec[key] = value
    return out_rec


def vertunion(files, prefixes, out):
    """Read the parallel files in lockstep and write one merged row per DocID.

    files    -- list of objects with readline(), one row "DocID \t JSON" each.
    prefixes -- list parallel to files, passed on to merge_records().
    out      -- object with write(); receives "DocID \t UnionOfJSONs\n" rows.

    Raises ValueError when the files have different lengths, when a row has
    no JSON column, or when the DocIDs on one row position differ.
    """
    while True:
        rows = [f.readline() for f in files]
        # readline() returns '' only at end of file.
        if not any(rows):
            break
        if not all(rows):
            raise ValueError("files didn't all end at same time")
        split_rows = [row.rstrip('\r\n').split('\t') for row in rows]
        for fields in split_rows:
            if len(fields) < 2:
                raise ValueError(
                    "row has no tab-separated JSON column: %r" % (fields,))
        docids = [fields[0] for fields in split_rows]
        if len(set(docids)) != 1:
            raise ValueError(
                "not all the same docid, not parallel: %s" % (docids,))
        records = [json.loads(fields[1]) for fields in split_rows]
        merged = merge_records(records, prefixes)
        out.write(docids[0] + '\t' + json.dumps(merged) + '\n')


def main(argv):
    """Command-line entry point; returns the process exit status."""
    filespecs = argv[1:]
    if not filespecs:
        sys.stdout.write(__doc__.strip() + '\n')
        return 1
    parsed = [parse_filespec(spec) for spec in filespecs]
    prefixes = [prefix for _, prefix in parsed]
    files = []
    try:
        for filename, _ in parsed:
            files.append(sys.stdin if filename == '-' else open(filename))
        vertunion(files, prefixes, sys.stdout)
    finally:
        # Close only what we opened ourselves; leave stdin alone.
        for f in files:
            if f is not sys.stdin:
                f.close()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment