Skip to content

Instantly share code, notes, and snippets.

@whym
Last active January 16, 2016 08:43
Show Gist options
  • Select an option

  • Save whym/1916946 to your computer and use it in GitHub Desktop.

Select an option

Save whym/1916946 to your computer and use it in GitHub Desktop.
Extract red-linked pages with the highest number of incoming links (for MediaWiki/Wikimedia)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import fileinput
from datetime import datetime
def format_header(now=None):
    """Return the HTML comment that records when the list was generated.

    ``now`` is a naive datetime expressed in UTC and defaults to the current
    UTC time.  The timestamp is suffixed with ``Z`` (UTC designator), so it
    must be built from UTC rather than from the machine's local time.
    """
    if now is None:
        now = datetime.utcnow()
    return '<!-- generated: %s -->' % now.strftime('%Y-%m-%dT%H:%M:%SZ')


def format_entry(line):
    """Turn one tab-separated input line into a numbered wikitext list item.

    The line must contain exactly three tab-separated fields: namespace,
    page title and number of incoming links.  The namespace field is parsed
    but not used in the output.

    Returns None for a blank line so callers can skip it; raises ValueError
    when a non-blank line does not have exactly three fields.
    """
    stripped = line.strip()
    if not stripped:
        # Blank lines (e.g. a trailing newline at end of input) carry no
        # record; skip them instead of failing on the tuple unpacking below.
        return None
    ns, page, n = stripped.split('\t')
    return '# [[%s]] ([[特別:Whatlinkshere/%s|%s 個のリンク]])' % (page, page, n)


def main():
    """Print the header, then one list item per line read via fileinput."""
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(format_header())
    for line in fileinput.input():
        entry = format_entry(line)
        if entry is not None:
            print(entry)


if __name__ == '__main__':
    main()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Extract red-linked pages with the highest number of incoming links (for MediaWiki/Wikimedia).
import oursql
import os
import argparse
import sys
import csv
import re
from collections import namedtuple
from datetime import datetime, timedelta
def labsdb_host(db):
    """Return the ``.labsdb`` host name for database name ``db``.

    Only a trailing ``_p`` is removed; a ``_p`` occurring elsewhere in the
    name is kept.
    """
    suffix = '_p'
    base = db[:-len(suffix)] if db.endswith(suffix) else db
    return base + '.labsdb'


def build_query(limit, all_namespaces=False, ignore=''):
    """Build the red-link query and its bound parameters.

    The query counts ``pagelinks`` rows per (namespace, title) target for
    which no matching row exists in ``page``, ordered by descending count.

    limit          -- maximum number of rows (formatted as an integer).
    all_namespaces -- when false, only targets with pl_namespace = 0 count.
    ignore         -- LIKE pattern; links coming from pages whose title
                      matches it are excluded.  Empty means no exclusion.

    Returns ``(sql, params)``.  The ignore pattern is passed as a ``?``
    placeholder value rather than being pasted into the SQL text, so quotes
    in the pattern cannot break or alter the statement.
    """
    join = ''
    conditions = []
    params = []
    if ignore:
        # The source page is only joined in when its title is needed.
        join = 'JOIN page AS pfrom ON pl_from = pfrom.page_id'
        conditions.append('pfrom.page_title NOT LIKE ?')
        params.append(ignore)
    if not all_namespaces:
        conditions.append('pl_namespace = 0')
    sql = '''
SELECT /* SLOW_OK */ pl_namespace, pl_title, COUNT(pl_from)
FROM pagelinks %(join)s
WHERE %(cond)s
AND NOT EXISTS (
SELECT * FROM page
WHERE page_title = pl_title
AND page_namespace = pl_namespace
)
GROUP BY pl_namespace, pl_title
ORDER BY COUNT(pl_from) DESC, pl_namespace, pl_title
LIMIT %(limit)d
;
''' % {'join': join,
       # '1=1' keeps the following "AND NOT EXISTS" valid with no filters.
       'cond': ' AND '.join(conditions) if conditions else '1=1',
       'limit': limit}
    return sql, tuple(params)


def main():
    """Parse the command line, run the query and write tab-separated rows."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'),
                        default=sys.stdout,
                        help='output file (default: standard output)')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    parser.add_argument('-A', '--all-namespaces', default=False,
                        dest='allns', action='store_true')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-I', '--ignore', metavar='PATTERN', default='',
                        dest='ignore', type=str,
                        help='ignore incoming links from these pages')
    options = parser.parse_args()

    sql, params = build_query(options.limit,
                              all_namespaces=options.allns,
                              ignore=options.ignore)
    try:
        conn = oursql.connect(host=labsdb_host(options.db),
                              read_default_file=os.path.expanduser('~/.my.cnf'),
                              db=options.db.replace('-', '_'),
                              charset=None,
                              use_unicode=False)
        try:
            cursor = conn.cursor()
            cursor.execute(sql, params)
            writer = csv.writer(options.output, delimiter='\t')
            # Stream rows straight from the cursor instead of first copying
            # the whole result set into a list.
            for row in cursor:
                writer.writerow(list(row))
        finally:
            conn.close()
    finally:
        # The output file is opened during argument parsing; close it here
        # so buffered rows are flushed, but leave sys.stdout alone.
        if options.output is not sys.stdout:
            options.output.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment