Skip to content

Instantly share code, notes, and snippets.

@whym
Created October 17, 2011 07:29
Show Gist options
  • Select an option

  • Save whym/1292129 to your computer and use it in GitHub Desktop.

Select an option

Save whym/1292129 to your computer and use it in GitHub Desktop.
splitting revision diffs
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Split revision diffs into separate files, one per revision, each named after its revision ID.
# See http://meta.wikimedia.org/wiki/WSoR_datasets/revision_diff for the input.
import csv
import argparse
import sys
import os
import ast
if __name__ == '__main__':
    # Command-line entry point: read tab-separated revision rows from the
    # input (a file given with -i, or stdin) and, for every row, write the
    # columns after the ninth into a file inside the output directory whose
    # name is the row's first column (the revision ID).
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='DIRECTORY',
                        dest='output', type=str, required=True,
                        help='')
    # NOTE(review): the 'U' mode flag is Python 2 era (this script uses
    # Python 2 constructs); it was removed in Python 3.11 -- revisit if porting.
    parser.add_argument('-i', '--input', metavar='FILE',
                        dest='input', type=lambda x: open(x, 'rU'), default=sys.stdin,
                        help='')
    # NOTE(review): options.verbose is parsed but never consulted below; the
    # per-revision progress line is always emitted.
    parser.add_argument('-v', '--verbose',
                        dest='verbose', action='store_true', default=False,
                        help='turn on verbose message output')
    options = parser.parse_args()

    # Raise the csv module's per-field size cap -- presumably because diff
    # fields can be far larger than the default limit.
    csv.field_size_limit(100000000000)

    for row in csv.reader(options.input, delimiter="\t"):
        # Only `rev` is used afterwards, but the full unpack is kept because it
        # rejects rows with fewer than nine columns (raises ValueError).
        rev, page, namespace, title, timestamp, comment, minor, user, username = row[0:9]
        diff = row[9:]

        # Use a context manager so each output file is flushed and closed as
        # soon as it is written, instead of leaking one open handle per row.
        with open(os.sep.join([options.output, rev]), 'w') as writer:
            # NOTE(review): `diff` is a list, so this writes its Python list
            # representation (same output as the former print statement) --
            # confirm that this is the intended file format.
            writer.write(str(diff) + '\n')

        # Progress trace: one revision ID per line on stderr.
        sys.stderr.write(rev + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment