Created
October 17, 2011 07:29
-
-
Save whym/1292129 to your computer and use it in GitHub Desktop.
splitting revision diffs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| # splitting revision diffs into files whose file names are the revision IDs | |
| # see http://meta.wikimedia.org/wiki/WSoR_datasets/revision_diff for the input | |
| import csv | |
| import argparse | |
| import sys | |
| import os | |
| import ast | |
def split_revision_diffs(rows, output_dir, verbose=False, log=None):
    """Write the diff columns of each row to a file named by its revision ID.

    Each row is a sequence of fields whose first nine entries are
    rev, page, namespace, title, timestamp, comment, minor, user and
    username; every field from the tenth onwards is treated as diff data.
    For each row a file ``<output_dir>/<rev>`` is (over)written with the
    ``str()`` of the list of diff fields followed by a newline, which is the
    same text the original ``print >>writer, diff`` statement produced.

    Args:
        rows: iterable of field sequences, e.g. a ``csv.reader``.
        output_dir: existing directory that receives one file per revision.
        verbose: when true, echo each processed revision ID to ``log``.
        log: writable text stream for messages; defaults to ``sys.stderr``.

    Returns:
        The number of revision files written.

    Raises:
        IOError/OSError: if an output file cannot be opened or written.
    """
    if log is None:
        log = sys.stderr
    written = 0
    for row in rows:
        if not row:
            # csv.reader yields an empty list for a blank input line.
            continue
        if len(row) < 9:
            # A row without the nine metadata columns cannot be attributed
            # to a revision reliably; report it instead of crashing.
            log.write('skipping malformed row with %d field(s)\n' % len(row))
            continue
        rev = row[0]
        diff = row[9:]
        # The context manager closes every output file promptly, so a long
        # input does not accumulate open file handles.
        with open(os.path.join(output_dir, rev), 'w') as writer:
            writer.write(str(diff) + '\n')
        written += 1
        if verbose:
            log.write(rev + '\n')
    return written


def main(argv=None):
    """Parse command-line options and split the input into per-revision files.

    Args:
        argv: argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='DIRECTORY',
                        dest='output', type=str, required=True,
                        help='directory that receives one file per revision, '
                             'named by revision ID')
    parser.add_argument('-i', '--input', metavar='FILE',
                        dest='input', type=lambda x: open(x, 'rU'),
                        default=sys.stdin,
                        help='tab-separated revision diff file '
                             '(default: standard input)')
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    options = parser.parse_args(argv)

    # Raise the csv module's per-field size cap far above its default;
    # presumably single diff fields in this dataset can be very large.
    csv.field_size_limit(100000000000)
    try:
        split_revision_diffs(csv.reader(options.input, delimiter="\t"),
                             options.output, verbose=options.verbose)
    finally:
        # Close the input only if this script opened it.
        if options.input is not sys.stdin:
            options.input.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment