Last active: February 4, 2020 21:52
-
-
Save sbassett29/76a45c5c16d5e0bcaa1174b44ff5b5e0 to your computer and use it in GitHub Desktop.
Search for various domains and URLs in article references on Wikimedia projects.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" Search for various ref domains and urls on Wikimedia projects | |
Author: sbassett29 | |
License: CC0 | |
Usage examples: | |
./searchRefs.py example.com | |
./searchRefs.py example.com/some/url | |
./searchRefs.py https://example.com/some/?arg=1 | |
./searchRefs.py -p es.wikipedia.org example.com [default: enwiki] | |
Prints a list of found Article#cite-note urls. | |
""" | |
import argparse | |
import re | |
import requests | |
import urllib.parse | |
""" cli args/control """ | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-p', '--project', help='A valid Wikimedia project \ | |
(e.g. en.wikipedia)', | |
default="en.wikipedia.org", | |
type=str) | |
parser.add_argument('search_term', | |
help='A search term, typically a domain or url.', | |
type=str) | |
args, unknown = parser.parse_known_args() | |
""" api requests """ | |
session = requests.Session() | |
base_site = ''.join(["https://", args.project]) | |
search_api_url = ''.join([base_site, "/w/api.php"]) | |
search_term = args.search_term | |
search_regexp = ''.join(["insource:|", re.escape(search_term), "|"]) | |
search_params = { | |
"action": "query", | |
"format": "json", | |
"list": "search", | |
"srwhat": "text", | |
"srprop": "", | |
"srlimit": 200, | |
"srsearch": search_regexp | |
} | |
rest_api_page_ref_url = ''.join([base_site, "/api/rest_v1/page/references/"]) | |
req_search = session.get(url=search_api_url, params=search_params) | |
for page_data in req_search.json()['query']['search']: | |
url = ''.join([rest_api_page_ref_url, page_data['title']]) | |
req_refs = session.get(url=url) | |
if 'references_by_id' in req_refs.json(): | |
refs_data = req_refs.json()['references_by_id'] | |
for ref in refs_data: | |
if refs_data[ref]['content']['html'].find(search_term) >= 0: | |
print(''.join([base_site, | |
"/wiki/", | |
urllib.parse.quote(page_data['title']), | |
"#", | |
ref])) |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.