Created
February 10, 2023 16:18
-
-
Save aimerneige/bf3e660c9a722fada15cad0326fde3a5 to your computer and use it in GitHub Desktop.
Download all tarot images from Wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| # Author: AimerNeige | |
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
# Directory the downloaded images are written into.
out_dir = "./tarot"
# Wikipedia (mobile) article whose image galleries are scraped below.
url = 'https://en.m.wikipedia.org/wiki/Rider%E2%80%93Waite_Tarot'

# Fetch the article. The timeout keeps a stalled connection from hanging the
# script forever, and raise_for_status() prevents silently parsing an HTTP
# error page, which would otherwise just produce an empty image list.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# Every <ul> in the article whose class list contains "gallery".
# Filtering with class_= skips tags that have no class attribute at all, so
# the lookup cannot raise and no bare `except:` is needed to paper over it.
gallery_list = soup.find_all("ul", class_="gallery")
# Every <li> with the "gallerybox" class found inside the collected galleries.
# class_= filtering ignores <li> tags without a class attribute, replacing the
# former try / bare-except around li['class'].
gallery_box_list = []
for gallery in gallery_list:
    gallery_box_list.extend(gallery.find_all("li", class_="gallerybox"))
# Every <a> with the "image" class found inside the gallery boxes.
# class_= filtering ignores anchors without a class attribute, replacing the
# former try / bare-except around a['class'].
image_list = []
for gallery_box in gallery_box_list:
    image_list.extend(gallery_box.find_all('a', class_='image'))
# Build one {"title", "href"} record per image anchor, where "href" is the
# absolute URL of the page the anchor links to.
tarot_image_list = []
for image in image_list:
    # .get() instead of [] so an anchor without an href is skipped rather
    # than aborting the whole run with a KeyError.
    href_path = image.get('href')
    if not href_path:
        continue
    # The title is only used for progress messages; fall back to the last
    # path segment of the link when the anchor carries no title attribute.
    title = image.get('title') or href_path.rsplit('/', 1)[-1]
    tarot_image_list.append({
        "title": title,
        "href": 'https://en.m.wikipedia.org{}'.format(href_path),
    })
# Download the full-size file behind every collected image record.
#
# The output directory must exist before any file is opened for writing;
# without this every write fails and is only reported as a download failure.
os.makedirs(out_dir, exist_ok=True)

for tarot_image in tarot_image_list:
    image_title = tarot_image['title']
    print(str.format("Start downlaod {}", image_title))

    # Fetch the page for this image. A network or HTTP error skips this one
    # image instead of aborting the remaining downloads.
    try:
        page_response = requests.get(tarot_image['href'], timeout=30)
        page_response.raise_for_status()
    except requests.RequestException:
        print("Warning, fail to open the file page of", image_title)
        continue

    tarot_image_soup = BeautifulSoup(page_response.text, "html.parser")

    # The links to the original file are looked up inside <div> elements
    # carrying the "fullMedia" class.
    media_div_list = tarot_image_soup.find_all('div', class_='fullMedia')
    if not media_div_list:
        print("Warning, fail to find the download link of", image_title)
        continue

    for media_div in media_div_list:
        # href=True skips anchors that have no href attribute at all.
        for media_a in media_div.find_all('a', href=True):
            raw_href = media_a['href']
            # Protocol-relative links ("//host/...") need a scheme; anything
            # else is used as given instead of being blindly prefixed.
            if raw_href.startswith('//'):
                image_href = 'https:{}'.format(raw_href)
            else:
                image_href = raw_href

            # Derive the file name before the try block so the warning below
            # always refers to the file that actually failed.
            image_name = image_href.split('/')[-1]
            image_path = os.path.join(out_dir, image_name)
            try:
                image_response = requests.get(image_href, timeout=30)
                # Do not save an HTTP error body to disk as if it were an image.
                image_response.raise_for_status()
                with open(image_path, 'wb') as img_f:
                    img_f.write(image_response.content)
            except (requests.RequestException, OSError):
                print("Warning, fail to download", image_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment