Created
January 30, 2024 04:01
-
-
Save erictleung/18ba6e8160ac8f8a51bce55bdb0e70f5 to your computer and use it in GitHub Desktop.
Help audit, remove, and update musician infoboxes on Wikipedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Help audit, remove, and update musician infoboxes.

https://en.wikipedia.org/wiki/Category:Pages_using_infobox_musical_artist_with_associated_acts
"""
import re
import webbrowser
from urllib.parse import quote

import mwparserfromhell
import requests
from bs4 import BeautifulSoup as bs
# pylint: disable=line-too-long
# Download the tracking category that lists every article still using the
# deprecated associated_acts= infobox parameter.
# NOTE(review): only this single listing page is requested; if the category
# is paginated, later pages are not followed -- confirm whether that matters.
response = requests.get(
    url="https://en.wikipedia.org/wiki/Category:Pages_using_infobox_musical_artist_with_associated_acts",  # noqa: E501
    timeout=10,
)
# Fail loudly on an HTTP error instead of scraping an error page.
response.raise_for_status()
soup = bs(response.content, "html.parser")

# Extract only links from the "Pages in category" section and not unnecessary
# Wiki links. Every letter group inside that section is visited; taking just
# the first "mw-category-group" on the page would return a single letter's
# worth of pages (or crash when no such element exists).
pages_section = soup.find(id="mw-pages")
all_pages = []
if pages_section is not None:
    for group in pages_section.find_all("div", class_="mw-category-group"):
        # href=True skips anchors that carry no href attribute at all.
        all_pages.extend(group.find_all("a", href=True))

# Extract text that can be inserted into a Wikipedia URL to get the page.
# The whole path after /wiki/ is captured (anything up to a query string or
# fragment) so titles containing "-", ".", "/", "!" etc. are not truncated.
p = re.compile(r"^/wiki/([^?#]+)$")
music_pages = []
for link in all_pages:
    matched = p.match(link["href"])
    if matched is not None:
        music_pages.append(matched.group(1))
print(f"Parsed {len(music_pages)} pages.")
# Setup to extract the raw Wikitext
wiki_base = "https://en.wikipedia.org/w/index.php?title="
wiki_end = "&action=raw&ctype=text"

# Without at least one parsed page there is nothing to look up; stop with a
# readable message rather than an IndexError on music_pages[0].
if not music_pages:
    raise SystemExit("No category pages were parsed; nothing to audit.")

# The title is concatenated as-is (not passed through requests' params=)
# because it was taken from an href and is therefore already URL-encoded;
# encoding it a second time would corrupt any percent-escapes.
search = wiki_base + music_pages[0] + wiki_end
print(f"Searching for {search}")
response = requests.get(search, timeout=10)
# Do not hand an HTTP error body to the wikitext parser.
response.raise_for_status()
wikicode = mwparserfromhell.parse(response.content)
# Pseudocode:
# Loop through initial pages
#   Extract associated_acts value
#   Extract Wikilinks and find their pages
#   Go and extract past members and former members from Infobox
#   Open up page
#   Print out copy-paste information for new values of:
#     - current_member_of=
#     - past_member_of=
#     - spinoff_of=
#     - spinoffs=
#   Pause to make manual changes
# Walk every template on the page, and for each musical-artist infobox open
# the article of every act wikilinked from its associated_acts= parameter.
for template in wikicode.filter_templates():
    if not template.name.matches("Infobox musical artist"):
        continue
    # Template.get() raises ValueError for a missing parameter, so check first.
    if not template.has("associated_acts"):
        print("Infobox has no associated_acts= parameter; skipping.")
        continue
    print("Getting the associated_acts= values:")
    # Compute the wikilinks once and reuse them for printing and iteration.
    ac = template.get("associated_acts").value.filter_wikilinks()
    print(ac)
    for link in ac:
        title = str(link.title).strip()
        if not title:
            continue
        # webbrowser.open() requires the URL to open. Build the article URL
        # from the link target: spaces become underscores and remaining
        # unsafe characters are percent-encoded ("#" is kept so section
        # links still point at their anchor).
        webbrowser.open(
            "https://en.wikipedia.org/wiki/"
            + quote(title.replace(" ", "_"), safe="/:#()")
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment