""" | |
Find the main image on a Wikipedia page and download it. | |
Using a list of Wikipedia URLs, download the main image from each page in the list. | |
Name the downloaded file to match the page URL. | |
""" | |
import requests, os | |
# set the folder name where images will be stored | |
my_folder = 'wiki_images' | |
# create the folder in the current working directory | |
# in which to store the downloaded images | |
os.makedirs(my_folder, exist_ok=True) | |
# front part of each Wikipedia URL | |
base_url = 'https://en.wikipedia.org/wiki/' | |
# partial URLs for each desired Wikipedia page | |
my_list = ['Anaea_troglodyta', | |
'Colias_eurytheme', | |
'Euphilotes_battoides', | |
'Great_spangled_fritillary', | |
'Papilio_troilus'] | |
# Wikipedia API query string to get the main image on a page | |
# (partial URL will be added to the end) | |
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=' | |
# get JSON data w/ API and extract image URL | |
def get_image_url(partial_url): | |
try: | |
api_res = requests.get(query + partial_url).json() | |
first_part = api_res['query']['pages'] | |
# this is a way around not knowing the article id number | |
for key, value in first_part.items(): | |
if (value['original']['source']): | |
data = value['original']['source'] | |
return data | |
except Exception as exc: | |
print(exc) | |
print("Partial URL: " + partial_url) | |
data = None | |
return data | |
# download one image with URL obtained from API | |
def download_image(the_url, the_page): | |
res = requests.get(the_url) | |
res.raise_for_status() | |
# get original file extension for image | |
# by splitting on . and getting the final segment | |
file_ext = '.' + the_url.split('.')[-1].lower() | |
# save the image to folder - binary file - with desired filename | |
image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb') | |
# download the image file | |
# HT to Automate the Boring Stuff with Python, chapter 12 | |
for chunk in res.iter_content(100000): | |
image_file.write(chunk) | |
image_file.close() | |
# loop to download main image for each page in list | |
counter = 1 | |
for the_page in my_list: | |
# get JSON data and extract image URL | |
the_url = get_image_url(the_page) | |
# if the URL is not None ... | |
if (the_url): | |
# tell us where we are for the heck of it | |
print("Downloading image " + str(counter)) | |
# download that image | |
download_image(the_url, the_page) | |
else: | |
print("No image file for " + the_page) | |
counter += 1 | |
print("All done!") |
@macloo Thank you for the reply. I don't know Python, but I just read chapters 1 to 3 of your Python course. I added
headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])'}
on line 18, under the base_url line, and changed line 49 from res = requests.get(the_url) to res = requests.get(the_url, headers=headers).
It seems to work.
I learned Pascal in school and know a little Excel, but I got there with your tutorial, so thank you :)
I used a Chrome extension (Link Klipper) to collect the Wikipedia URLs, then Excel to extract the page names with =TRIM(RIGHT(SUBSTITUTE(A1,"/",REPT(" ",255)),255)) and ="'"&TEXTJOIN("', '",1,F1:F1000)&"'" to join them for use in the script.
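For reference, a rough Python equivalent of those Excel steps, for anyone who wants to build my_list without leaving Python. It assumes the full Wikipedia URLs sit one per line in a plain-text file; the file name urls.txt is just an example.

# sketch: build my_list from a file of full Wikipedia URLs, one per line
with open('urls.txt') as f:
    my_list = [line.strip().rstrip('/').split('/')[-1] for line in f if line.strip()]

print(my_list)   # e.g. ['Anaea_troglodyta', 'Colias_eurytheme', ...]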
@nuganics You shouldn't use 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])'
but instead edit that to identify yourself. It is a way of saying: "Hi, I am not evil. Here's my identification." That is explained at https://meta.wikimedia.org/wiki/User-Agent_policy#Python, though maybe not explained clearly enough.
@nuganics I'll bet adding the header shown there will fix the script. I will try it when I have time. Thanks for pointing out the error.
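A minimal sketch of that change, based on nuganics's comment and the Wikimedia User-Agent policy page linked above. The app name, URL, and email in the header are placeholders; replace them with details that identify you.

# near the top of the script, under base_url: a header that identifies you
# (the name, URL, and email below are placeholders, not a real identity)
headers = {'User-Agent': 'MyWikiImageScript/0.1 (https://example.org/mypage; [email protected])'}

# in get_image_url(), pass the headers with the API request:
api_res = requests.get(query + partial_url, headers=headers).json()

# in download_image(), pass the same headers when fetching the image:
res = requests.get(the_url, headers=headers)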