Last active
February 14, 2020 22:01
-
-
Save sbassett29/e0fd288eb03c43e85d8d383647d543f6 to your computer and use it in GitHub Desktop.
Some quick python3 to search a wikimedia project article title (single language) across a given project class (wikipedia, wiktionary, etc.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Article title check across Wikimedia projects
Author: sbassett29
License: CC0
"""
import argparse
import json
import sys
from urllib.parse import quote

import requests
""" arguments """ | |
def parse_args(argv=None):
    """Parse the command line and return the recognised arguments.

    Unrecognised arguments are tolerated and discarded (``parse_known_args``).

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``project`` and ``title`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'project',
        help='A valid Wikimedia project type (e.g. wikipedia), '
             'or "all" for every project',
        type=str)
    parser.add_argument(
        'title',
        help='An article or page title to search (e.g. Cat)',
        type=str)
    args, _unknown = parser.parse_known_args(argv)
    return args


# Sitematrix API endpoint listing the Wikimedia project sites.
SITEMATRIX_URL = ('https://meta.wikimedia.org/w/api.php?action=sitematrix'
                  '&formatversion=2&format=json')

# Seconds to wait on any single HTTP request, so that one stalled host
# cannot hang the whole scan.
REQUEST_TIMEOUT = 30


def fetch_sitematrix():
    """Download the sitematrix and return its ``sitematrix`` mapping.

    Prints a message to stderr and exits with status 1 when the request
    fails, the response status is not 200, or the payload is not JSON
    containing a ``sitematrix`` key.
    """
    try:
        resp = requests.get(SITEMATRIX_URL, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Request Error: {}'.format(err), file=sys.stderr)
        sys.exit(1)

    if resp.status_code != 200:
        print("Response Error, status code = {}".format(resp.status_code),
              file=sys.stderr)
        sys.exit(1)

    try:
        return json.loads(resp.text)['sitematrix']
    except (ValueError, KeyError, TypeError) as err:
        print('Unexpected sitematrix response: {!r}'.format(err),
              file=sys.stderr)
        sys.exit(1)


def collect_project_urls(sitematrix, project):
    """Return the site URLs in ``sitematrix`` that match ``project``.

    Dict values contribute the entries of their ``site`` list; list values
    contribute their own entries. Any other value, and any entry that is
    not a dict with a string ``url``, is skipped.

    Args:
        sitematrix: Mapping taken from the ``sitematrix`` key of the API
            response.
        project: Substring to look for in each site URL, or ``'all'`` to
            accept every site.

    Returns:
        Matching URLs as a list, in the order they were encountered.
    """
    matches = []
    for entry in sitematrix.values():
        if isinstance(entry, dict):
            sites = entry.get('site')
            if not isinstance(sites, list):
                continue
        elif isinstance(entry, list):
            sites = entry
        else:
            # Neither a site group nor a site list; nothing to scan.
            continue

        for site in sites:
            if not isinstance(site, dict):
                continue
            url = site.get('url')
            if not isinstance(url, str):
                continue
            if project == 'all' or project in url:
                matches.append(url)
    return matches


def build_article_url(project_url, title):
    """Return ``<project_url>/wiki/<title>`` with the title percent-encoded.

    The title is treated as literal text. ``/`` and ``:`` are left as-is;
    characters such as ``?`` and ``#`` are escaped so they are not parsed
    as the start of a query string or fragment.
    """
    return ''.join([project_url, '/wiki/', quote(title, safe='/:')])


def main(argv=None):
    """Report every matching project that serves the requested title.

    A project counts as a hit when a GET of the article URL ends with
    status 200. Projects whose request fails outright are reported on
    stderr and skipped, so one unreachable host does not abort the scan.
    """
    args = parse_args(argv)
    sitematrix = fetch_sitematrix()

    for project_url in collect_project_urls(sitematrix, args.project):
        article_url = build_article_url(project_url, args.title)
        try:
            resp = requests.get(article_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print('Skipping {}: {}'.format(article_url, err),
                  file=sys.stderr)
            continue
        if resp.status_code == 200:
            print('Article "{}" found at: {}'.format(args.title,
                                                     article_url))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example usage:
./WikimediaArticleSearch.py wikipedia Cat
./WikimediaArticleSearch.py wiktionary Dog
./WikimediaArticleSearch.py all Elephant