Skip to content

Instantly share code, notes, and snippets.

@sbassett29
Last active February 4, 2020 21:52
Show Gist options
  • Save sbassett29/76a45c5c16d5e0bcaa1174b44ff5b5e0 to your computer and use it in GitHub Desktop.
Search for various domains and urls in article references on Wikimedia projects
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Search for various ref domains and urls on Wikimedia projects

Queries the MediaWiki search API (insource:) for pages whose wikitext
contains the given term, then checks each page's references via the
REST references endpoint and prints the Article#cite-note url of every
reference whose html contains the term.

Author: sbassett29
License: CC0
Usage examples:
./searchRefs.py example.com
./searchRefs.py example.com/some/url
./searchRefs.py https://example.com/some/?arg=1
./searchRefs.py -p es.wikipedia.org example.com [default: en.wikipedia.org]
Prints a list of found Article#cite-note urls.
"""
import argparse
import re
import requests
import urllib.parse
""" cli args/control """
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--project', help='A valid Wikimedia project \
(e.g. en.wikipedia)',
default="en.wikipedia.org",
type=str)
parser.add_argument('search_term',
help='A search term, typically a domain or url.',
type=str)
args, unknown = parser.parse_known_args()
""" api requests """
session = requests.Session()
base_site = ''.join(["https://", args.project])
search_api_url = ''.join([base_site, "/w/api.php"])
search_term = args.search_term
search_regexp = ''.join(["insource:|", re.escape(search_term), "|"])
search_params = {
"action": "query",
"format": "json",
"list": "search",
"srwhat": "text",
"srprop": "",
"srlimit": 200,
"srsearch": search_regexp
}
rest_api_page_ref_url = ''.join([base_site, "/api/rest_v1/page/references/"])
req_search = session.get(url=search_api_url, params=search_params)
for page_data in req_search.json()['query']['search']:
url = ''.join([rest_api_page_ref_url, page_data['title']])
req_refs = session.get(url=url)
if 'references_by_id' in req_refs.json():
refs_data = req_refs.json()['references_by_id']
for ref in refs_data:
if refs_data[ref]['content']['html'].find(search_term) >= 0:
print(''.join([base_site,
"/wiki/",
urllib.parse.quote(page_data['title']),
"#",
ref]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment