geobabbler · July 19, 2024 11:01
diff --git a/scrape_photos.py b/scrape_photos.py
 import requests
 from bs4 import BeautifulSoup
 import os

 from urllib.parse import quote_plus

 qry = "glacier" #wikimedia commons query
 # Define the search URL. The query is percent-encoded so that terms containing
 # spaces, "&", "#" or non-ASCII characters cannot corrupt the query string.
 search_url = f"https://commons.wikimedia.org/w/index.php?search={quote_plus(qry)}&title=Special:MediaSearch&go=Go&type=image"

 # Headers sent with every request made by this script.
 # NOTE(review): this value impersonates a desktop browser. Wikimedia's
 # User-Agent policy asks automated clients to identify themselves with a
 # descriptive agent and contact details - consider replacing it before use.
 headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
 }
 # Function to download an image
 def download_image(image_url, save_path, timeout=30):
    """Download ``image_url`` and write the response body to ``save_path``.

    Failures are reported on stdout instead of being raised, so one bad URL
    does not abort a batch of downloads.

    Args:
        image_url: URL of the image to fetch.
        save_path: Destination file path, opened in binary write mode.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block forever.
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when writing the file fails part-way through.
        with requests.get(image_url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except (requests.RequestException, OSError) as e:
        # Network/HTTP problems and file-system errors are expected here;
        # anything else is a programming error and should surface.
        print(f"Failed to download {image_url}. Error: {e}")
    else:
        print(f"Downloaded {save_path}")

 # Function to get image URLs from Wikimedia Commons search results
 def get_image_urls(search_url, max_images=10, timeout=30):
    """Return the ``src`` URLs of image results found on a search page.

    Args:
        search_url: URL of the search results page to fetch.
        max_images: Maximum number of ``<img class="sd-image">`` tags to
            inspect.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        A list of image URLs, each resolved against ``search_url``. The list
        is empty when the page contains no matching tags.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    response = requests.get(search_url, headers=headers, timeout=timeout)
    # Surface 4xx/5xx responses instead of parsing an error page into an
    # empty (and misleading) result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    image_tags = soup.find_all('img', {'class': 'sd-image'}, limit=max_images)
    # Tags without a src are skipped (indexing them would raise KeyError);
    # relative or protocol-relative values are resolved against the page URL.
    return [urljoin(search_url, img['src']) for img in image_tags if img.get('src')]

 # Directory to save images
 save_dir = "sample_images"
 os.makedirs(save_dir, exist_ok=True)

 # Get image URLs
 image_urls = get_image_urls(search_url)

 # Download images, numbering the files from 1
 for idx, image_url in enumerate(image_urls, start=1):
    # Take the extension from the URL (ignoring any query string or fragment)
    # so PNG/GIF results are not mislabelled as JPEG; fall back to ".jpg"
    # when the URL carries no extension.
    url_path = image_url.split("?", 1)[0].split("#", 1)[0]
    extension = os.path.splitext(url_path)[1] or ".jpg"
    save_path = os.path.join(save_dir, f"{qry}_{idx}{extension}")
    download_image(image_url, save_path)
	import requests
	from bs4 import BeautifulSoup
	import os

	from urllib.parse import quote_plus

	qry = "glacier" #wikimedia commons query
	# Define the search URL. The query is percent-encoded so that terms containing
	# spaces, "&", "#" or non-ASCII characters cannot corrupt the query string.
	search_url = f"https://commons.wikimedia.org/w/index.php?search={quote_plus(qry)}&title=Special:MediaSearch&go=Go&type=image"

	# Headers sent with every request made by this script.
	# NOTE(review): this value impersonates a desktop browser. Wikimedia's
	# User-Agent policy asks automated clients to identify themselves with a
	# descriptive agent and contact details - consider replacing it before use.
	headers = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
	}
	# Function to download an image
	def download_image(image_url, save_path, timeout=30):
	    """Download ``image_url`` and write the response body to ``save_path``.

	    Failures are reported on stdout instead of being raised, so one bad URL
	    does not abort a batch of downloads.

	    Args:
	        image_url: URL of the image to fetch.
	        save_path: Destination file path, opened in binary write mode.
	        timeout: Seconds to wait for the server before giving up; passed
	            straight to ``requests.get``. Without it a stalled connection
	            would block forever.
	    """
	    try:
	        # Using the response as a context manager releases the streamed
	        # connection even when writing the file fails part-way through.
	        with requests.get(image_url, headers=headers, stream=True, timeout=timeout) as response:
	            response.raise_for_status()
	            with open(save_path, 'wb') as file:
	                for chunk in response.iter_content(chunk_size=8192):
	                    file.write(chunk)
	    except (requests.RequestException, OSError) as e:
	        # Network/HTTP problems and file-system errors are expected here;
	        # anything else is a programming error and should surface.
	        print(f"Failed to download {image_url}. Error: {e}")
	    else:
	        print(f"Downloaded {save_path}")

	# Function to get image URLs from Wikimedia Commons search results
	def get_image_urls(search_url, max_images=10, timeout=30):
	    """Return the ``src`` URLs of image results found on a search page.

	    Args:
	        search_url: URL of the search results page to fetch.
	        max_images: Maximum number of ``<img class="sd-image">`` tags to
	            inspect.
	        timeout: Seconds to wait for the server; passed to ``requests.get``.

	    Returns:
	        A list of image URLs, each resolved against ``search_url``. The list
	        is empty when the page contains no matching tags.

	    Raises:
	        requests.RequestException: If the request fails or the server
	            answers with an HTTP error status.
	    """
	    from urllib.parse import urljoin

	    response = requests.get(search_url, headers=headers, timeout=timeout)
	    # Surface 4xx/5xx responses instead of parsing an error page into an
	    # empty (and misleading) result.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, 'html.parser')
	    image_tags = soup.find_all('img', {'class': 'sd-image'}, limit=max_images)
	    # Tags without a src are skipped (indexing them would raise KeyError);
	    # relative or protocol-relative values are resolved against the page URL.
	    return [urljoin(search_url, img['src']) for img in image_tags if img.get('src')]

	# Directory to save images
	save_dir = "sample_images"
	os.makedirs(save_dir, exist_ok=True)

	# Get image URLs
	image_urls = get_image_urls(search_url)

	# Download images, numbering the files from 1
	for idx, image_url in enumerate(image_urls, start=1):
	    # Take the extension from the URL (ignoring any query string or fragment)
	    # so PNG/GIF results are not mislabelled as JPEG; fall back to ".jpg"
	    # when the URL carries no extension.
	    url_path = image_url.split("?", 1)[0].split("#", 1)[0]
	    extension = os.path.splitext(url_path)[1] or ".jpg"
	    save_path = os.path.join(save_dir, f"{qry}_{idx}{extension}")
	    download_image(image_url, save_path)