Last active
June 16, 2021 13:33
-
-
Save bertrandmartel/45bc65e0b34497d6d8831d217679d8a6 to your computer and use it in GitHub Desktop.
Extract the sections of a Wikipedia page and the images linked from each section via its wikilinks (https://stackoverflow.com/questions/67959324/how-to-scrape-images-of-a-wikipedia-page-with-labels-indicating-to-which-section)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Try this on: https://replit.com/@bertrandmartel/WikipediaExtractImages
import urllib.parse

import requests
import wikitextparser as wtp
# MediaWiki endpoint used for BOTH the wikitext and the image lookups.
# NOTE(review): the script originally read the page from zh.wikipedia.org but
# asked en.wikipedia.org for the image URLs; files that only exist on the
# source wiki would then be reported as missing. Confirm that resolving the
# files on the source wiki is the intended behaviour.
API_URL = 'https://zh.wikipedia.org/w/api.php'
PAGE_TITLE = '蔣中正'
# Namespace prefix (including the colon) that identifies an image wikilink.
FILE_PREFIX = "File:"
# Number of titles sent per imageinfo request.
CHUNK_SIZE = 20
# Seconds to wait for the API before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def chunked(items, size):
    """Split *items* into consecutive lists of at most *size* elements.

    An empty input yields an empty list (no empty trailing chunk).
    """
    return [items[start:start + size] for start in range(0, len(items), size)]


def title_key(title):
    """Return the case- and underscore-insensitive lookup key for *title*."""
    return title.lower().replace("_", " ")


def image_entry(title, text):
    """Build the record stored for one image wikilink.

    *title* is the full link title (with the ``File:`` prefix) and *text* is
    the part of the link after the first pipe, or ``None`` when the link has
    no pipe at all.
    """
    return {
        # A link without a pipe has no text; report empty metadata rather
        # than failing on ``None.split``.
        "metadata": text.split("|") if text is not None else [],
        "title": title[len(FILE_PREFIX):],
        "title_full": title,
    }


def fetch_json(params):
    """GET ``API_URL`` with *params* and return the decoded JSON body.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def fetch_wikitext(page):
    """Return the raw wikitext of *page*."""
    payload = fetch_json({
        'action': 'parse',
        'page': page,
        'contentmodel': 'wikitext',
        'prop': 'wikitext',
        'format': 'json',
    })
    return payload['parse']['wikitext']['*']


def extract_sections(wikitext):
    """Return one ``{"title", "images"}`` dict per section of *wikitext*."""
    sections = []
    for section in wtp.parse(wikitext).sections:
        images = [
            image_entry(link.title, link.text)
            for link in wtp.parse(section.contents).wikilinks
            # Require the colon so that ordinary links whose title merely
            # begins with "File" are not mistaken for images.
            if link.title.startswith(FILE_PREFIX)
        ]
        sections.append({
            "title": section.title,
            "images": images,
        })
    return sections


def index_image_urls(pages):
    """Map ``title_key(title)`` to the image URL for each entry of *pages*.

    *pages* is the ``query.pages`` mapping of an ``imageinfo`` response.
    Entries without image information are reported on stdout and skipped.
    """
    image_map = {}
    for page in pages.values():
        title = page["title"]
        info = page.get("imageinfo")
        if info:
            image_map[title_key(title)] = info[0]["url"]
        else:
            print(f"missing: {title}")
    return image_map


def fetch_image_urls(titles):
    """Resolve *titles* to image URLs, querying the API in chunks."""
    image_map = {}
    # The same file can be linked more than once; ask for each title once.
    unique_titles = list(dict.fromkeys(titles))
    for chunk in chunked(unique_titles, CHUNK_SIZE):
        payload = fetch_json({
            'action': 'query',
            'titles': '|'.join(chunk),
            'prop': 'imageinfo',
            'iiprop': 'url',
            'format': 'json',
        })
        pages = payload.get("query", {}).get("pages", {})
        image_map.update(index_image_urls(pages))
    return image_map


def attach_image_urls(sections, image_map):
    """Set ``imageUrl`` on every image of *sections*, in place.

    Images whose title is not present in *image_map* get ``"not found"``.
    """
    for section in sections:
        for image in section["images"]:
            image["imageUrl"] = image_map.get(
                title_key(image["title_full"]), "not found"
            )


def main():
    """Fetch the page, resolve its images per section and print the result."""
    results = extract_sections(fetch_wikitext(PAGE_TITLE))
    titles = [
        image["title_full"]
        for section in results
        for image in section["images"]
    ]
    attach_image_urls(results, fetch_image_urls(titles))
    print(results)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment