extract red-linked pages with the highest numbers of incoming links (for MediaWiki/Wikimedia)
Two scripts follow. The first formats the query output (tab-separated namespace, title, and incoming-link count) as a wiki-markup list:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Render tab-separated (namespace, title, count) rows as a wiki-markup list.
import fileinput
from datetime import datetime

print '<!-- generated: %s -->' % datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
for line in fileinput.input():
    ns, page, n = line.strip().split('\t')
    # "特別:Whatlinkshere" is Special:WhatLinksHere on the Japanese Wikipedia;
    # the link label reads "<n> links" in Japanese.
    print '# [[%s]] ([[特別:Whatlinkshere/%s|%s 個のリンク]])' % (page, page, n)
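A minimal sketch of what the formatter does with one input row; "Example_page" and the count 42 are made-up values for illustration, not real query output:

# -*- coding: utf-8 -*-
# Hypothetical round trip for the formatter above (made-up values).
line = '0\tExample_page\t42'
ns, page, n = line.strip().split('\t')
print '# [[%s]] ([[特別:Whatlinkshere/%s|%s 個のリンク]])' % (page, page, n)
# -> # [[Example_page]] ([[特別:Whatlinkshere/Example_page|42 個のリンク]])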
The second script runs the underlying query against a Wikimedia Labs database replica:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# extract red-linked pages with the highest numbers of incoming links (for MediaWiki/Wikimedia)
import oursql
import os
import argparse
import sys
import csv

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'), default=sys.stdout,
                        help='output file (default: stdout)')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    parser.add_argument('-A', '--all-namespaces', default=False,
                        dest='allns', action='store_true',
                        help='look for red links in all namespaces, not only the main namespace')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-I', '--ignore', metavar='PATTERN', default='',
                        dest='ignore', type=str,
                        help='ignore incoming links from these pages')
    options = parser.parse_args()

    # Labs convention: the database "xxwiki_p" is served from host "xxwiki.labsdb".
    host = options.db.replace('_p', '') + '.labsdb'
    conn = oursql.connect(host=host,
                          read_default_file=os.path.expanduser('~/.my.cnf'),
                          db=options.db.replace('-', '_'),
                          charset=None,
                          use_unicode=False)
    cursor = conn.cursor()

    cond = []
    if options.ignore:
        # Note: the pattern is interpolated into the SQL unescaped.
        cond.append('pfrom.page_title NOT LIKE "%s"' % options.ignore)
    if not options.allns:
        cond.append('pl_namespace = 0')

    # A link is "red" when no page exists with the linked title and namespace.
    cursor.execute('''
    SELECT /* SLOW_OK */ pl_namespace, pl_title, COUNT(pl_from)
    FROM pagelinks %(join)s
    WHERE %(cond)s
    AND NOT EXISTS (
        SELECT * FROM page
        WHERE page_title = pl_title
        AND page_namespace = pl_namespace
    )
    GROUP BY pl_namespace, pl_title
    ORDER BY COUNT(pl_from) DESC, pl_namespace, pl_title
    LIMIT %(limit)d
    ;
    ''' % {'join': 'JOIN page AS pfrom ON pl_from = pfrom.page_id' if options.ignore else '',
           'cond': ' AND '.join(cond) if cond else '1=1',
           'limit': options.limit})

    writer = csv.writer(options.output, delimiter='\t')
    for row in cursor:
        writer.writerow(list(row))
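For reference, under the default options (main namespace only, no --ignore pattern, limit 4000) the string formatting above assembles the following statement; this is reconstructed from the code, not captured from a live run:

SELECT /* SLOW_OK */ pl_namespace, pl_title, COUNT(pl_from)
FROM pagelinks
WHERE pl_namespace = 0
AND NOT EXISTS (
    SELECT * FROM page
    WHERE page_title = pl_title
    AND page_namespace = pl_namespace
)
GROUP BY pl_namespace, pl_title
ORDER BY COUNT(pl_from) DESC, pl_namespace, pl_title
LIMIT 4000
;

Assuming the two files are saved as, say, redlinks.py and wikilist.py (hypothetical names, as is the example database jawiki_p), the two scripts chain as: python redlinks.py -d jawiki_p | python wikilist.py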