Created
July 19, 2024 11:01
-
-
Save geobabbler/5a0a03827792a3f1915897e94416edb5 to your computer and use it in GitHub Desktop.
Script to download sample images from Wikimedia Commons
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
qry = "glacier"  # Wikimedia Commons search term

# Define the search URL. The term is percent-encoded so that queries
# containing spaces, '&', '#', or non-ASCII text still produce a valid URL
# (for a plain word such as "glacier" the result is unchanged).
search_url = (
    "https://commons.wikimedia.org/w/index.php"
    f"?search={quote_plus(qry)}&title=Special:MediaSearch&go=Go&type=image"
)

# Headers sent with every request made by this script.
# NOTE(review): this impersonates a desktop browser. Wikimedia's User-Agent
# policy asks automated clients to send a descriptive, contactable
# identifier instead - confirm and replace before running this at any volume.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Function to download an image
def download_image(image_url, save_path, timeout=30):
    """Download ``image_url`` and write its bytes to ``save_path``.

    This is a best-effort helper: network and file-system failures are
    reported on stdout and swallowed so that a caller looping over many
    URLs can continue with the next one.

    Args:
        image_url: URL of the image to fetch.
        save_path: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.
    """
    try:
        # With stream=True the body is fetched lazily, so the response must
        # be closed explicitly; the with-block does that on every exit path.
        with requests.get(
            image_url, headers=headers, stream=True, timeout=timeout
        ) as response:
            # Check the status before opening the file so an HTTP error does
            # not leave an empty file behind.
            response.raise_for_status()
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except (requests.RequestException, OSError) as e:
        # Only network and file-system failures are expected here; anything
        # else is a programming error and is allowed to propagate.
        print(f"Failed to download {image_url}. Error: {e}")
    else:
        print(f"Downloaded {save_path}")
# Function to get image URLs from Wikimedia Commons search results
def get_image_urls(search_url, max_images=10, timeout=30):
    """Return the ``src`` URLs of result images found on a search page.

    The page at ``search_url`` is fetched and parsed, and every ``<img>``
    element carrying the ``sd-image`` class is inspected.

    Args:
        search_url: URL of the search results page to scrape.
        max_images: Maximum number of URLs to return. A falsy value
            (``0`` or ``None``) disables the cap.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of image URL strings in document order; empty when the page
        contains no matching images.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(search_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty, misleading result.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    image_urls = []
    for img in soup.find_all('img', {'class': 'sd-image'}):
        # Tags without a usable src would previously raise KeyError.
        src = img.get('src')
        if not src:
            continue
        image_urls.append(src)
        # The cap is applied after filtering so that skipped tags do not
        # reduce the number of URLs returned.
        if max_images and len(image_urls) >= max_images:
            break
    return image_urls
# Directory to save images
save_dir = "sample_images"
os.makedirs(save_dir, exist_ok=True)

# Get image URLs
image_urls = get_image_urls(search_url)

# Download images
for idx, image_url in enumerate(image_urls, start=1):
    # Keep the extension the URL actually carries (ignoring any query string)
    # so that PNG/GIF results are not mislabelled as JPEG; fall back to .jpg
    # when the URL has no extension.
    ext = os.path.splitext(image_url.split("?", 1)[0])[1] or ".jpg"
    save_path = os.path.join(save_dir, f"{qry}_{idx}{ext}")
    download_image(image_url, save_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment