Created March 12, 2012 11:46
extract shortest pages (for MediaWiki/Wikimedia)
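This gist contains two Python 2 scripts. The second queries a Toolserver database replica for non-redirect articles below a given byte size and writes them out as tab-separated values; the first turns that output into a sortable wikitable, or into a standalone HTML page rendered through the MediaWiki parse API. Assuming the files are saved as format_shortpages.py and extract_shortpages.py (hypothetical names; the gist's own file names are not shown here), a typical pipeline might be:

    python extract_shortpages.py -d jawiki_p | python format_shortpages.py -f html -s ja.wikipedia.org > shortpages.html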
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# format shortest pages (for MediaWiki/Wikimedia) as wikitext or HTML
import sys
import json
import urllib
import urllib2
import argparse
from datetime import datetime

def format_date(x):
    # Render a datetime in the ISO 8601 form used in MediaWiki output (UTC).
    return x.strftime('%Y-%m-%dT%H:%M:%SZ')

def parse_date(x):
    # Parse the 14-digit MediaWiki timestamp (YYYYMMDDHHMMSS).
    return datetime.strptime(x, '%Y%m%d%H%M%S')

def format_wikitext(lines):
    # Turn tab-separated rows (namespace, title, byte length, timestamp,
    # revision id) into a sortable wikitable, one table row per page.
    yield '{|class="wikitable sortable"'
    yield '! Page !! Bytes !! Timestamp'
    for line in lines:
        ns, page, size, timestamp, oldid = line.strip().split('\t')
        yield '|-'
        yield '| [[%s]] || %s || [//{{SERVERNAME}}/{{SCRIPTPATH}}/index.php?oldid=%s %s]' % (
            page, size, oldid, format_date(parse_date(timestamp)))
    yield '|}'

def render(text, script):
    # POST the wikitext to the MediaWiki parse API and return the rendered HTML.
    data = 'format=json&action=parse&text=%s' % urllib.quote(text)
    try:
        res = urllib2.urlopen(urllib2.Request(script,
            data=data,
            headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'})).read()
    except urllib2.URLError as e:
        print >>sys.stderr, e
        sys.exit(1)
    return json.loads(res.decode('utf-8'))['parse']['text']['*']

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--format', choices=['wikitext', 'html'],
                        dest='format', default='wikitext',
                        help='output format')
    parser.add_argument('-s', '--site', metavar='ADDRESS',
                        dest='site', type=str, default='ja.wikipedia.org',
                        help='target wiki name')
    parser.add_argument('infile',
                        nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args()
    formatted = format_wikitext(options.infile)
    if options.format == 'wikitext':
        print '<!-- generated: %s -->' % format_date(datetime.now())
        for line in formatted:
            print line
    else:
        header = '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="ja" dir="ltr" class="client-nojs" xmlns="http://www.w3.org/1999/xhtml">
<head>
<base href="//%(site)s" />
<title>jawiki shortpages</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<link rel="copyright" href="//creativecommons.org/licenses/by-sa/3.0/" />
<link rel="stylesheet" href="//bits.wikimedia.org/%(site)s/load.php?debug=false&lang=ja&modules=site&only=styles&skin=vector&*" type="text/css" media="all" />
<style type="text/css" media="all">a:lang(ar),a:lang(ckb),a:lang(fa),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}a.new,#quickbar a.new{color:#ba0000}
</style>
<script src="//bits.wikimedia.org/%(site)s/load.php?debug=false&lang=ja&modules=startup&only=scripts&skin=vector&*" type="text/javascript"></script>
<script type="text/javascript" src="//toolserver.org/~whym/jquery/jquery-latest.js"></script>
<script type="text/javascript" src="//toolserver.org/~whym/jquery/jquery.tablesorter.js"></script>
<script type="text/javascript">
$(document).ready(function() {
    $(".sortable").tablesorter();
});
</script>
</head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject skin-vector action-view">'''
        footer = '''
<!-- /footer -->
<script type="text/javascript">if(window.mw){
mw.loader.load(["mediawiki.user","mediawiki.page.ready","mediawiki.legacy.mwsuggest","ext.vector.collapsibleNav","ext.vector.collapsibleTabs","ext.vector.editWarning","ext.vector.simpleSearch"], null, true);
}</script>
</body>
</html>
'''
        header = header % {'site': options.site}
        script = 'http://%s/w/api.php' % options.site
        print header
        print '<!-- generated: %s -->' % format_date(datetime.now())
        print render("\n".join(formatted), script).encode('utf-8')
        print footer
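The formatter expects one page per input line, with five tab-separated fields matching the SELECT of the extraction script below: namespace, page title, byte length, timestamp of the latest revision (YYYYMMDDHHMMSS), and the revision id used to build the permalink. A hypothetical input line such as `0 Example_page 57 20120301123456 41234567` (fields tab-separated, data invented for illustration) would become a table row linking [[Example_page]] to oldid=41234567 with timestamp 2012-03-01T12:34:56Z.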
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# extract shortest pages (for MediaWiki/Wikimedia)
import oursql
import os
import argparse
import sys
import csv

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'), default=sys.stdout,
                        help='output file (default: stdout)')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    parser.add_argument('-s', '--size', metavar='N', default=128,
                        dest='len', type=int,
                        help='maximum page size')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-P', '--ignore-pages', metavar='PATTERN', default=[],
                        dest='ignorep', action='append',
                        help='ignore these pages')
    parser.add_argument('-C', '--ignore-categories', metavar='PATTERN', default=[],
                        dest='ignorec', action='append',
                        help='ignore pages in these categories')
    options = parser.parse_args()

    # Toolserver replica host names use dashes (e.g. jawiki-p.rrdb.toolserver.org)
    # while the database names use underscores (e.g. jawiki_p).
    options.db = options.db.replace('_', '-')
    host = options.db + '.rrdb.toolserver.org'
    conn = oursql.connect(host=host,
                          read_default_file=os.path.expanduser('~/.my.cnf'),
                          db=options.db.replace('-', '_'),
                          charset=None,
                          use_unicode=False)
    cursor = conn.cursor()
    # Build exclusion conditions from the SQL LIKE patterns given on the
    # command line; '1' (always true) is substituted when no pattern was given.
    condpages = ' AND '.join(['page_title not like "%s"' % x for x in options.ignorep])
    condcats = ' AND '.join(['cl_to not like "%s"' % x for x in options.ignorec])
    # Select non-redirect articles below the size threshold, together with
    # the timestamp and id of their latest revision.
    cursor.execute('''
    SELECT page_namespace, page_title, page_len, rev_timestamp, rev_id
    FROM page LEFT JOIN revision ON page_latest = rev_id
    WHERE page_namespace = 0
      AND page_is_redirect = 0
      AND %(condpages)s
      AND page_len < %(len)s
      AND NOT EXISTS ( SELECT cl_from FROM categorylinks
                       WHERE cl_from = page_id
                         AND NOT (%(condcats)s) )
    LIMIT %(limit)s
    ;
    ''' % {'condpages': condpages if len(condpages) > 0 else '1',
           'condcats': condcats if len(condcats) > 0 else '1',
           'len': options.len,
           'limit': options.limit})
    writer = csv.writer(options.output, delimiter='\t')
    for row in cursor:
        writer.writerow(row)
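The extraction script reads MySQL credentials from ~/.my.cnf and connects to the per-wiki Toolserver replica at DBNAME.rrdb.toolserver.org. The -P and -C options take SQL LIKE patterns (so % acts as a wildcard) and may be repeated. A sketch of an invocation, again assuming the hypothetical file name extract_shortpages.py and an invented category pattern:

    python extract_shortpages.py -d jawiki_p -s 128 -l 4000 -C 'Wikipedia%' -o shortpages.tsv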