Skip to content

Instantly share code, notes, and snippets.

@whym
Created March 13, 2012 06:34
Show Gist options
  • Save whym/2027257 to your computer and use it in GitHub Desktop.
Save whym/2027257 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# list main-namespace redirects that point outside the main namespace,
# with the number of incoming links to each (for MediaWiki/Wikimedia)
import oursql
import os
import argparse
import sys
import csv
import re
from collections import namedtuple
from datetime import datetime, timedelta
def parse_args(argv=None):
    """Parse the command-line options of this script.

    argv -- list of argument strings; None makes argparse read sys.argv.

    Returns the argparse namespace with attributes output, limit, len, db,
    ignorep and ignorec.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'),
                        default=sys.stdout,
                        help='')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    # NOTE: --size is still accepted so existing invocations keep working,
    # but the query below does not use it.
    parser.add_argument('-s', '--size', metavar='N', default=128,
                        dest='len', type=int,
                        help='maximum page size')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-P', '--ignore-pages', metavar='PATTERN', default=[],
                        dest='ignorep', action='append',
                        help='ignore these pages')
    parser.add_argument('-C', '--ignore-categories', metavar='PATTERN',
                        default=[],
                        dest='ignorec', action='append',
                        help='ignore pages in these categories')
    return parser.parse_args(argv)


def resolve_db(db):
    """Return (host, schema) for a wiki database name.

    The host name is spelled with '-' and the schema name with '_',
    whichever of the two the caller used.
    """
    host = db.replace('_', '-') + '.rrdb.toolserver.org'
    schema = db.replace('-', '_')
    return host, schema


def build_query(limit, ignore_pages=(), ignore_categories=()):
    """Build the redirect query and its bound parameters.

    limit             -- maximum number of rows to return.
    ignore_pages      -- SQL LIKE patterns; matching page titles are skipped.
    ignore_categories -- SQL LIKE patterns; pages in a matching category
                         are skipped.

    Returns (sql, params).  User-supplied patterns are never interpolated
    into the SQL text: each one becomes a '?' placeholder with its value in
    params, in the same order as the placeholders.
    """
    clauses = [
        'rd_namespace != 0',
        'page_namespace = 0',
        'page_title not like "_:%"',
        'page_title not like "__:%"',
        'page_title not like "___:%"',
    ]
    params = []
    for pattern in ignore_pages:
        clauses.append('page_title not like ?')
        params.append(pattern)
    for pattern in ignore_categories:
        clauses.append('NOT EXISTS (SELECT 1 FROM categorylinks'
                       ' WHERE cl_from = page_id AND cl_to like ?)')
        params.append(pattern)

    lines = [
        'SELECT page_title,rd_namespace,rd_title,count(pa.pl_from)'
        ' FROM redirect',
        'JOIN page ON page_id = rd_from',
        'LEFT JOIN pagelinks AS pa ON pa.pl_title = page_title'
        ' AND pa.pl_namespace = page_namespace',
        'WHERE ' + '\nAND '.join(clauses),
        # Group by the page itself, not by the LEFT JOINed pagelinks
        # columns: those are NULL for every redirect without incoming
        # links, which would merge all such redirects into a single row.
        'GROUP BY page_id, page_title, rd_namespace, rd_title',
        'LIMIT %d' % int(limit),
    ]
    return '\n'.join(lines), params


def main(argv=None):
    """Run the query against the selected wiki and write the rows as TSV."""
    options = parse_args(argv)
    host, schema = resolve_db(options.db)
    query, params = build_query(options.limit, options.ignorep,
                                options.ignorec)
    try:
        conn = oursql.connect(host=host,
                              read_default_file=os.path.expanduser('~/.my.cnf'),
                              db=schema,
                              charset=None,
                              use_unicode=False)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, params)
                writer = csv.writer(options.output, delimiter='\t')
                # Stream rows straight from the cursor instead of loading
                # the whole result set into memory first.
                for row in cursor:
                    writer.writerow(row)
            finally:
                cursor.close()
        finally:
            conn.close()
    finally:
        # Only close files this script opened itself; leave stdout alone.
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment