Skip to content

Instantly share code, notes, and snippets.

@aimerneige
Created February 10, 2023 16:18
Show Gist options
  • Select an option

  • Save aimerneige/bf3e660c9a722fada15cad0326fde3a5 to your computer and use it in GitHub Desktop.

Select an option

Save aimerneige/bf3e660c9a722fada15cad0326fde3a5 to your computer and use it in GitHub Desktop.
Download all tarot images from Wikipedia
# -*- coding: utf-8 -*-
# Author: AimerNeige
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Directory the downloaded card images are written to.
out_dir = "./tarot"
# Wikipedia article whose image galleries are scanned for card images.
url = 'https://en.m.wikipedia.org/wiki/Rider%E2%80%93Waite_Tarot'
# Seconds to wait for any single HTTP response, so that a stalled
# connection cannot hang the whole run.
request_timeout = 30


def fetch_soup(page_url):
    """Download *page_url* and parse it with the ``html.parser`` backend.

    Args:
        page_url: Absolute URL of the HTML page to fetch.

    Returns:
        The parsed page as a ``BeautifulSoup`` object.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(page_url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it
    # were the requested document.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def find_tarot_images(soup, base_url):
    """Collect the image links found inside the galleries of *soup*.

    Only ``<a class="image">`` elements nested in ``<li class="gallerybox">``
    elements nested in ``<ul class="gallery">`` elements are considered.
    Anchors without a non-empty ``href`` are skipped.

    Args:
        soup: Parsed article page.
        base_url: URL the page was fetched from; link targets are resolved
            against it.

    Returns:
        A list of ``{"title": ..., "href": ...}`` dicts, one per anchor,
        where ``href`` is an absolute URL.
    """
    tarot_image_list = []
    for gallery in soup.find_all("ul", class_="gallery"):
        for gallery_box in gallery.find_all("li", class_="gallerybox"):
            for image in gallery_box.find_all("a", class_="image"):
                href = image.get('href')
                if not href:
                    continue
                tarot_image_list.append({
                    # The title is only used for progress messages, so an
                    # anchor without one falls back to its link target.
                    "title": image.get('title', href),
                    "href": urljoin(base_url, href),
                })
    return tarot_image_list


def find_full_media_urls(file_page_soup, base_url):
    """Return the targets of all links inside ``<div class="fullMedia">``.

    Args:
        file_page_soup: Parsed page linked from a gallery image.
        base_url: URL that page was fetched from; link targets (including
            scheme-relative ``//host/path`` ones) are resolved against it.

    Returns:
        A list of absolute URLs, possibly empty.
    """
    media_urls = []
    for media_div in file_page_soup.find_all('div', class_="fullMedia"):
        # href=True skips anchors that carry no link target at all.
        for media_a in media_div.find_all('a', href=True):
            media_urls.append(urljoin(base_url, media_a['href']))
    return media_urls


def save_image(image_data, image_name, target_dir):
    """Write *image_data* to ``target_dir/image_name`` and return the path.

    Args:
        image_data: Raw bytes to store.
        image_name: File name to use inside *target_dir*.
        target_dir: Destination directory; created (with parents) when it
            does not exist yet.

    Returns:
        The path of the written file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    os.makedirs(target_dir, exist_ok=True)
    image_path = os.path.join(target_dir, image_name)
    with open(image_path, 'wb') as img_f:
        img_f.write(image_data)
    return image_path


def download_image(image_url, image_name, target_dir):
    """Fetch *image_url* and store its body as ``target_dir/image_name``.

    Returns:
        The path of the written file.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: If the file cannot be written.
    """
    response = requests.get(image_url, timeout=request_timeout)
    # Without this an HTTP error page would be saved under the image name.
    response.raise_for_status()
    return save_image(response.content, image_name, target_dir)


def main():
    """Download every gallery image of the article at ``url`` into ``out_dir``.

    A failure on one image is reported on stdout and the run continues with
    the next one; only a failure to fetch the article itself aborts the run.
    """
    soup = fetch_soup(url)
    for tarot_image in find_tarot_images(soup, url):
        image_title = tarot_image['title']
        print("Start downlaod {}".format(image_title))
        try:
            tarot_image_soup = fetch_soup(tarot_image['href'])
        except requests.RequestException as err:
            print("Warning, fail to find the download link of", image_title,
                  "-", err)
            continue
        image_urls = find_full_media_urls(tarot_image_soup, tarot_image['href'])
        if not image_urls:
            print("Warning, fail to find the download link of", image_title)
            continue
        for image_url in image_urls:
            # Computed before the try so the warning below can always name
            # the file, even when the request itself fails.
            image_name = image_url.split('/')[-1]
            try:
                download_image(image_url, image_name, out_dir)
            except (requests.RequestException, OSError) as err:
                print("Warning, fail to download", image_name, "-", err)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment