Get URLs of images on sub-pages using Python
# This script still contains all of the print() outputs that I used while refining and debugging it.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def find_products(url):
    print(f"Fetching content from URL: {url}")

    # Set up Selenium WebDriver
    options = Options()
    options.add_argument('--headless')
    service = Service('/usr/local/bin/chromedriver')  # Provide path to chromedriver executable; this is the path on Mac.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Load the URL
        driver.get(url)
        print("Page loaded successfully.")

        # Wait for dynamically loaded content to appear (adjust timeout as needed)
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, 'img')))
        print("Dynamically loaded content appeared.")

        # Get the HTML content after all dynamic content is loaded
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all links containing 'products' in the path
        # The links I needed to target on the given 'url' contained 'products' in their path.
        product_links = [link.get('href') for link in soup.find_all('a', href=lambda href: href and 'products' in href)]

        if product_links:
            print(f"Found {len(product_links)} product links.")
        else:
            print("No product links found.")

        # Iterate over each product link
        for product_link in product_links:
            print(f"Fetching content from product URL: {product_link}")

            # Load the product page
            driver.get(urljoin(url, product_link))
            print("Product page loaded successfully.")

            # Wait for dynamically loaded content to appear (adjust timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            print("Dynamically loaded content on product page appeared.")

            # Wait for the first link matching the selector to become clickable.
            # I had to wait here because the image galleries are loaded with JS.
            # Clicking one of the images opens the gallery and puts 'ggbl_slider' in the DOM.
            link_element = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            link_element.click()
            print("Clicked on the first link matching the selector.")

            # Wait for dynamically loaded content to appear in ggbl_slider (adjust timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, 'ggbl_slider')))
            print("Dynamically loaded content in ggbl_slider appeared.")

            # Get the HTML content after all dynamic content is loaded on the product page
            product_html_content = driver.page_source
            product_soup = BeautifulSoup(product_html_content, 'html.parser')

            # Find the <ul> element with ID "ggbl_slider"
            ul_element = product_soup.find('ul', id='ggbl_slider')

            # If the <ul> element with ID "ggbl_slider" exists
            if ul_element:
                print("Found <ul> element with ID 'ggbl_slider'.")

                # Find all <li> elements inside the <ul>
                li_elements = ul_element.find_all('li')

                # Iterate over each <li> element
                for li in li_elements:
                    # Find the <img> tag inside the <li>
                    img_tag = li.find('img')

                    # If the <img> tag exists, print the src attribute
                    if img_tag:
                        print(f"Image src: {img_tag.get('src')}")
                    else:
                        print("No <img> tag found.")
            else:
                print("No <ul> element with ID 'ggbl_slider' found.")
    finally:
        # Close the WebDriver
        driver.quit()


# Example usage:
url = 'https://the-starting-page.com'
find_products(url)
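
If you don't want to hard-code the chromedriver path (the '/usr/local/bin/chromedriver' above is Mac-specific), a minimal alternative setup is sketched below. It assumes Selenium 4.6 or newer, where the built-in Selenium Manager locates a matching driver automatically, so no Service(path) is needed.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # Selenium Manager resolves the driver binary
driver.get('https://the-starting-page.com')
print(driver.title)
driver.quit()

Swapping this driver setup into find_products() would remove the OS-specific path without changing anything else in the script.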