Created
March 13, 2012 06:34
-
-
Save whym/2027257 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# list main-namespace redirects that point into other namespaces, with incoming-link counts (for MediaWiki/Wikimedia)
import oursql | |
import os | |
import argparse | |
import sys | |
import csv | |
import re | |
from collections import namedtuple | |
from datetime import datetime, timedelta | |
def build_query(limit):
    """Return the SQL that lists cross-namespace redirects.

    The query selects redirects whose source page is in namespace 0 and
    whose target is in any other namespace, together with the number of
    pagelinks rows pointing at the source page.  Source titles that start
    with a one- to three-character prefix followed by ':' are excluded.

    limit -- maximum number of rows to return; formatted with ``%d`` so
             only integers are accepted.
    """
    # Group by the redirect page itself.  Grouping by pl_title/pl_namespace
    # (columns of the LEFT JOINed table) would merge every redirect that has
    # no incoming links into a single row, because those columns are NULL
    # for all of them.
    return '''
        SELECT page_title, rd_namespace, rd_title, count(pa.pl_from) FROM redirect
          JOIN page ON page_id = rd_from
          LEFT JOIN pagelinks AS pa ON pa.pl_title = page_title AND pa.pl_namespace = page_namespace
          WHERE rd_namespace != 0
            AND page_namespace = 0
            AND page_title not like "_:%%"
            AND page_title not like "__:%%"
            AND page_title not like "___:%%"
          GROUP BY page_id, page_title, rd_namespace, rd_title
          LIMIT %d
        ;
    ''' % limit


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'), default=sys.stdout,
                        help='')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    # NOTE: -s, -P and -C are accepted for command-line compatibility, but
    # the query below does not use them yet.
    parser.add_argument('-s', '--size', metavar='N', default=128,
                        dest='len', type=int,
                        help='maximum page size')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-P', '--ignore-pages', metavar='PATTERN', default=[],
                        dest='ignorep', action='append',
                        help='ignore these pages')
    parser.add_argument('-C', '--ignore-categories', metavar='PATTERN', default=[],
                        dest='ignorec', action='append',
                        help='ignore pages in these categories')
    options = parser.parse_args()

    try:
        # The host name uses '-' where the database name uses '_'.
        options.db = options.db.replace('_', '-')
        host = options.db + '.rrdb.toolserver.org'
        conn = oursql.connect(host=host,
                              read_default_file=os.path.expanduser('~/.my.cnf'),
                              db=options.db.replace('-', '_'),
                              charset=None,
                              use_unicode=False)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(build_query(options.limit))
                writer = csv.writer(options.output, delimiter='\t')
                # Stream rows straight from the cursor instead of building
                # an intermediate list of the whole result set.
                for row in cursor:
                    writer.writerow(row)
            finally:
                cursor.close()
        finally:
            conn.close()
    finally:
        # Close the output only if it is a file we opened ourselves.
        if options.output is not sys.stdout:
            options.output.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment