Skip to content

Instantly share code, notes, and snippets.

@milimetric
Last active August 8, 2016 21:35
Show Gist options
  • Select an option

  • Save milimetric/fd14bcf6131c676fd11db30d19e32a0a to your computer and use it in GitHub Desktop.

Select an option

Save milimetric/fd14bcf6131c676fd11db30d19e32a0a to your computer and use it in GitHub Desktop.
Map localized namespace prefixes to namespace integer codes for all (public) Wikimedia projects.
import requests
import json
import csv
# HTTP request headers that identify this bot (User-Agent) and give the
# servers it queries a contact address for its operator (From).
headers = dict([
    ('User-Agent', 'Bot Dan Andreescu parsing Special:PrefixIndex pages'),
    ('From', 'dandreescu@wikimedia.org'),
])
def get_wikis(matrix=None):
    """Return the site entries of every public, open wiki in the sitematrix.

    Args:
        matrix: Optional, already-decoded sitematrix mapping (the value found
            under the ``sitematrix`` key of the API response).  When omitted,
            the matrix is downloaded from the mediawiki.org sitematrix API.

    Returns:
        A list of site dicts: first the sites of every language group, then
        the ``specials``.  Entries carrying a ``private`` or ``closed`` key
        are left out.

    Raises:
        requests.RequestException: the download failed, timed out, or the
            server answered with an HTTP error status (only when ``matrix``
            is not supplied).
        ValueError: the downloaded body is not valid JSON.
    """
    if matrix is None:
        response = requests.get(
            'https://www.mediawiki.org/w/api.php?action=sitematrix'
            '&smsiteprop=url|dbname|code'
            '&smstate=all'
            '&format=json',
            # Identify the bot on this request too, like the per-wiki
            # requests do, and never wait forever on a stalled connection.
            headers=headers,
            timeout=30
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        matrix = response.json().get('sitematrix', {})

    candidates = []
    for group in matrix.values():
        # Only language groups are dicts with a 'site' list; other top-level
        # values (including the 'specials' list added below) are skipped here.
        if isinstance(group, dict) and 'site' in group:
            candidates.extend(group['site'])
    candidates.extend(matrix.get('specials', []))

    return [
        wiki
        for wiki in candidates
        if 'private' not in wiki and 'closed' not in wiki
    ]
wikis = get_wikis()
# Not referenced anywhere else in this script; kept so the module-level name
# stays defined.
namespaceDictionary = {}
query = '/w/api.php?action=query&format=json&meta=siteinfo&siprop=namespaces'

###
# Writes one CSV row per namespace of every wiki, as:
#   (hostname, dbname, ns integer, ns canonical, ns localized)
#   hostname     : ja.wikipedia.org
#   dbname       : jawiki
#   ns integer   : 2, 100, etc.
#   ns canonical : the english prefix if exists, otherwise the localized prefix
#   ns localized : the localized prefix
#
# A wiki whose namespaces cannot be fetched or parsed is reported on stdout
# and skipped; none of its rows are written.
###
# Binary mode plus UTF-8 encoded byte strings is what the Python 2 csv module
# expects for non-ASCII data.
with open('namespace.dictionary.csv', 'wb') as w:
    spamwriter = csv.writer(w)
    for wiki in wikis:
        site = wiki.get('url', '')
        host = site.replace('https://', '')
        dbname = wiki.get('dbname', host)
        try:
            # The timeout keeps one stalled wiki from hanging the whole run.
            r = requests.get(site + query, headers=headers, timeout=30)
            # Surface HTTP errors as such rather than as a JSON parse failure.
            r.raise_for_status()
            ns = r.json()['query']['namespaces']
            # Build every row before writing any, so a wiki that fails
            # part-way leaves no partial output behind.
            rows = []
            for k, v in ns.items():
                localized = v.get('*', '')
                # Fall back to the localized prefix when no canonical name
                # is reported, as documented in the header comment.
                canonical = v.get('canonical', localized)
                rows.append([host, dbname, k, canonical, localized])
        except Exception as e:
            # Best effort: report the failing wiki and carry on with the rest.
            print('*************************')
            print(site + ' FAILED!!! with ' + str(e))
            print('*************************')
            continue
        for row in rows:
            spamwriter.writerow([unicode(s).encode("utf-8") for s in row])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment