Skip to content

Instantly share code, notes, and snippets.

@sbassett29
Last active February 14, 2020 22:01
Show Gist options
  • Save sbassett29/e0fd288eb03c43e85d8d383647d543f6 to your computer and use it in GitHub Desktop.
Save sbassett29/e0fd288eb03c43e85d8d383647d543f6 to your computer and use it in GitHub Desktop.
A quick Python 3 script that searches for an article title (single language) across all Wikimedia sites of a given project class (wikipedia, wiktionary, etc.).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Article title check across Wikimedia projects
Author: sbassett29
License: CC0
"""
import argparse
import json
import sys
from urllib.parse import quote

import requests
""" arguments """
def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``project`` (a project type such as
        ``wikipedia``, or the special value ``all``) and ``title``.
        Unrecognised extra arguments are tolerated and ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('project',
                        help='A valid Wikimedia project type (e.g. wikipedia)',
                        type=str)
    parser.add_argument('title',
                        help='An article or page title to search (e.g. Cat)',
                        type=str)
    args, _ = parser.parse_known_args(argv)
    return args


SITEMATRIX_URL = ('https://meta.wikimedia.org/w/api.php?action=sitematrix'
                  '&formatversion=2&format=json')

# Seconds to wait for any single HTTP request before giving up; without a
# timeout one unresponsive wiki would hang the whole run.
REQUEST_TIMEOUT = 30

# Wikimedia's User-Agent policy asks automated clients to identify
# themselves instead of sending a generic library default.
REQUEST_HEADERS = {
    'User-Agent': ('WikimediaArticleSearch/1.0 '
                   '(https://gist.github.com/sbassett29/'
                   'e0fd288eb03c43e85d8d383647d543f6) python-requests'),
}


def fetch_sitematrix():
    """Download the sitematrix from meta.wikimedia.org.

    Returns:
        The value of the ``sitematrix`` key of the API's JSON response.

    Exits the process with status 1 if the request fails or the server
    does not answer with HTTP 200.
    """
    try:
        resp = requests.get(SITEMATRIX_URL, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("Request Error: {}".format(err))
        sys.exit(1)
    if resp.status_code != 200:
        print("Response Error, status code = {}".format(resp.status_code))
        sys.exit(1)
    return json.loads(resp.text)['sitematrix']


def collect_project_urls(sitematrix, project):
    """Return the site URLs in *sitematrix* that belong to *project*.

    Args:
        sitematrix: Mapping as returned by ``fetch_sitematrix``. Values that
            are dicts contribute the entries of their ``site`` list; values
            that are lists contribute their own entries; anything else
            (e.g. an integer counter) is skipped.
        project: Substring that must occur in a site's URL, or ``'all'`` to
            accept every site.

    Returns:
        A list of URL strings in the order they appear in *sitematrix*.
    """
    matches = []
    for entry in sitematrix.values():
        if isinstance(entry, dict):
            # .get() so a dict entry without a 'site' key is skipped
            # instead of raising KeyError.
            sites = entry.get('site')
            if not isinstance(sites, list):
                continue
        elif isinstance(entry, list):
            sites = entry
        else:
            continue
        for site in sites:
            if not isinstance(site, dict):
                continue
            url = site.get('url')
            if not isinstance(url, str):
                continue
            if project == 'all' or project in url:
                matches.append(url)
    return matches


def build_article_url(project_url, title):
    """Return the ``/wiki/`` URL of *title* on the site at *project_url*.

    The title is percent-encoded so that characters such as ``?``, ``#`` or
    ``%`` stay part of the page name instead of being interpreted as URL
    syntax. ``/`` and ``:`` are left as-is.
    """
    return ''.join([project_url, '/wiki/', quote(title, safe='/:')])


def article_exists(url):
    """Return True if a GET request for *url* answers with HTTP 200.

    A network-level failure is reported on stderr and treated as "not
    found" so that one unreachable site does not abort the whole search.
    """
    try:
        resp = requests.get(url, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Skipping {}: {}'.format(url, err), file=sys.stderr)
        return False
    return resp.status_code == 200


def main(argv=None):
    """Look up the requested title on every matching Wikimedia site.

    Prints one line per site on which the article URL answers HTTP 200.

    Returns:
        0 on completion (sitematrix failures exit with status 1 earlier).
    """
    args = parse_args(argv)
    sitematrix = fetch_sitematrix()
    for project_url in collect_project_urls(sitematrix, args.project):
        article_url = build_article_url(project_url, args.title)
        if article_exists(article_url):
            print('Article "{}" found at: {}'.format(args.title, article_url))
    return 0


if __name__ == '__main__':
    sys.exit(main())
@sbassett29
Copy link
Author

sbassett29 commented Sep 26, 2019

Example usage:
./WikimediaArticleSearch.py wikipedia Cat
./WikimediaArticleSearch.py wiktionary Dog
./WikimediaArticleSearch.py all Elephant

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment