""" | |
Find the main image on a Wikipedia page and download it. | |
Using a list of Wikipedia URLs, download the main image from each page in the list. | |
Name the downloaded file to match the page URL. | |
""" | |
import requests, os | |
# set the folder name where images will be stored | |
my_folder = 'wiki_images' | |
# create the folder in the current working directory | |
# in which to store the downloaded images | |
os.makedirs(my_folder, exist_ok=True) | |
# front part of each Wikipedia URL | |
base_url = 'https://en.wikipedia.org/wiki/' | |
# partial URLs for each desired Wikipedia page | |
my_list = ['Anaea_troglodyta', | |
'Colias_eurytheme', | |
'Euphilotes_battoides', | |
'Great_spangled_fritillary', | |
'Papilio_troilus'] | |
# Wikipedia API query string to get the main image on a page | |
# (partial URL will be added to the end) | |
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=' | |
# get JSON data w/ API and extract image URL | |
def get_image_url(partial_url): | |
try: | |
api_res = requests.get(query + partial_url).json() | |
first_part = api_res['query']['pages'] | |
# this is a way around not knowing the article id number | |
for key, value in first_part.items(): | |
if (value['original']['source']): | |
data = value['original']['source'] | |
return data | |
except Exception as exc: | |
print(exc) | |
print("Partial URL: " + partial_url) | |
data = None | |
return data | |
# download one image with URL obtained from API | |
def download_image(the_url, the_page): | |
res = requests.get(the_url) | |
res.raise_for_status() | |
# get original file extension for image | |
# by splitting on . and getting the final segment | |
file_ext = '.' + the_url.split('.')[-1].lower() | |
# save the image to folder - binary file - with desired filename | |
image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb') | |
# download the image file | |
# HT to Automate the Boring Stuff with Python, chapter 12 | |
for chunk in res.iter_content(100000): | |
image_file.write(chunk) | |
image_file.close() | |
# loop to download main image for each page in list | |
counter = 1 | |
for the_page in my_list: | |
# get JSON data and extract image URL | |
the_url = get_image_url(the_page) | |
# if the URL is not None ... | |
if (the_url): | |
# tell us where we are for the heck of it | |
print("Downloading image " + str(counter)) | |
# download that image | |
download_image(the_url, the_page) | |
else: | |
print("No image file for " + the_page) | |
counter += 1 | |
print("All done!") |
I'm getting blocked with a 403 error; the user agent needs to comply with https://meta.wikimedia.org/wiki/User-Agent_policy#Python now, I guess?
@nuganics I'll bet adding the header shown there will fix the script. I will try it when I have time. Thanks for pointing out the error.
@macloo Thank you for the reply. I don't know Python, but I just read chapters 1 to 3 of your Python course. I added
headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])'}
on line 18, under the base_url line, and on line 49 I changed res = requests.get(the_url) to res = requests.get(the_url, headers=headers).
It seems to work.
I learnt Pascal in school and know a little Excel, but I got there with your tutorial, so thank you :)
I used a Chrome extension (Link Klipper) to collect the Wikipedia URLs, then Excel to extract the page titles with =TRIM(RIGHT(SUBSTITUTE(A1,"/",REPT(" ",255)),255)) and to join them with ="'"&TEXTJOIN("', '",1,F1:F1000)&"'" for use in your script.
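For reference, that Excel step could also be done in Python. A minimal sketch, assuming the full page URLs are saved in a plain-text file named wiki_urls.txt, one URL per line (that filename is just an example):

# read full Wikipedia URLs from a text file (one URL per line)
# and keep only the final path segment -- the page title --
# so the result can be dropped into my_list in the script above
with open('wiki_urls.txt') as f:
    my_list = [line.strip().rstrip('/').split('/')[-1] for line in f if line.strip()]

print(my_list)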
@nuganics You shouldn't use 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])' as-is; edit that string to identify yourself. It is a way of saying: "Hi, I am not evil. Here's my identification." That is explained at https://meta.wikimedia.org/wiki/User-Agent_policy#Python, though maybe not explained clearly enough.
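To put the fix in one place, here is a minimal sketch of the change, assuming you replace the placeholder bot name, URL, and email with your own details. The same headers= argument also needs to be passed in the requests.get(the_url) call inside download_image():

import requests

# identify your script per https://meta.wikimedia.org/wiki/User-Agent_policy#Python
# (the bot name, URL, and email below are placeholders -- substitute your own)
headers = {'User-Agent': 'WikiImageScript/0.1 (https://example.org/wiki-images/; [email protected])'}

query = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# the same API call as in the script above, now sending the User-Agent header
api_res = requests.get(query + 'Nathalis_iole', headers=headers).json()
print(api_res)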
That screenshot (previous comment) probably needs more explanation.
If you access the API with your web browser, you would put this into the address bar and press Return/Enter:
https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=Nathalis_iole
What you'd get back, in the browser window, would look like what is in the screenshot. Nathalis_iole (at the end of that string) is the page title, which is also the last part of the URL for this Wikipedia page:
https://en.wikipedia.org/wiki/Nathalis_iole
In the Python script, you use that query string, NOT in the browser window, but directly through the Python code. As a result, you can loop over MANY filenames and download LOTS of images.
The API returns a JSON-formatted object, so you need to extract the relevant URL (for the image file) from that JSON. That is what's happening in lines 33–39 in the Python script. The JSON is being read, as a Python dictionary would be read. The code is getting one thing: the image URL.
After you've got the image URL standing alone, you can download the image and save it on your computer.
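For illustration, here is a simplified sketch of the JSON that comes back and of how that loop pulls out the image URL. The page id and image URL below are made-up placeholders:

import json

# a trimmed-down example of the API response
# (the page id "123456" and the image URL are placeholders)
sample = json.loads('''
{
  "query": {
    "pages": {
      "123456": {
        "pageid": 123456,
        "title": "Nathalis iole",
        "original": {
          "source": "https://upload.wikimedia.org/wikipedia/commons/a/ab/Example.jpg"
        }
      }
    }
  }
}
''')

# the article id ("123456") is not known ahead of time,
# so loop over the pages dictionary and read the image URL from each entry
for key, value in sample['query']['pages'].items():
    print(value['original']['source'])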