Skip to content

Instantly share code, notes, and snippets.

@geobabbler
Created July 19, 2024 11:01
Show Gist options
  • Save geobabbler/5a0a03827792a3f1915897e94416edb5 to your computer and use it in GitHub Desktop.
Save geobabbler/5a0a03827792a3f1915897e94416edb5 to your computer and use it in GitHub Desktop.
# Script to download sample images from Wikimedia Commons
import os
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
qry = "glacier" #wikimedia commons query
# Define the search URL
search_url = f"https://commons.wikimedia.org/w/index.php?search={qry}&title=Special:MediaSearch&go=Go&type=image"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Function to download an image
def download_image(image_url, save_path):
try:
response = requests.get(image_url, headers=headers, stream=True)
response.raise_for_status()
with open(save_path, 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
print(f"Downloaded {save_path}")
except Exception as e:
print(f"Failed to download {image_url}. Error: {e}")
# Function to get image URLs from Wikimedia Commons search results
def get_image_urls(search_url, max_images=10):
response = requests.get(search_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
image_tags = soup.find_all('img', {'class': 'sd-image'}, limit=max_images)
image_urls = [img['src'] for img in image_tags]
return image_urls
# Directory to save images
save_dir = "sample_images"
os.makedirs(save_dir, exist_ok=True)
# Get image URLs
image_urls = get_image_urls(search_url)
# Download images
for idx, image_url in enumerate(image_urls):
save_path = os.path.join(save_dir, f"{qry}_{idx+1}.jpg")
download_image(image_url, save_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment