Get URLs of images on sub-pages using Python
# This script still contains all of the print() outputs that I used while refining and debugging it.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def find_products(url):
    print(f"Fetching content from URL: {url}")

    # Set up Selenium WebDriver
    options = Options()
    options.add_argument('--headless')
    service = Service('/usr/local/bin/chromedriver')  # Provide path to chromedriver executable; this is the path on Mac.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Load the URL
        driver.get(url)
        print("Page loaded successfully.")

        # Wait for dynamically loaded content to appear (adjust timeout as needed)
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, 'img')))
        print("Dynamically loaded content appeared.")

        # Get the HTML content after all dynamic content is loaded
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all links containing 'products' in the path
        # The links I needed to target on the given 'url' contained 'products' in their path.
        product_links = [link.get('href') for link in soup.find_all('a', href=lambda href: href and 'products' in href)]

        if product_links:
            print(f"Found {len(product_links)} product links.")
        else:
            print("No product links found.")

        # Iterate over each product link
        for product_link in product_links:
            print(f"Fetching content from product URL: {product_link}")

            # Load the product page
            driver.get(urljoin(url, product_link))
            print("Product page loaded successfully.")

            # Wait for dynamically loaded content to appear (adjust timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            print("Dynamically loaded content on product page appeared.")

            # Wait for the first link matching the selector to become clickable.
            # I had to wait here because the image galleries are loaded with JS.
            # Clicking one of the images opens the gallery and puts 'ggbl_slider' in the DOM.
            link_element = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            link_element.click()
            print("Clicked on the first link matching the selector.")

            # Wait for dynamically loaded content to appear in ggbl_slider (adjust timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, 'ggbl_slider')))
            print("Dynamically loaded content in ggbl_slider appeared.")

            # Get the HTML content after all dynamic content is loaded on the product page
            product_html_content = driver.page_source
            product_soup = BeautifulSoup(product_html_content, 'html.parser')

            # Find the <ul> element with ID "ggbl_slider"
            ul_element = product_soup.find('ul', id='ggbl_slider')

            # If the <ul> element with ID "ggbl_slider" exists
            if ul_element:
                print("Found <ul> element with ID 'ggbl_slider'.")

                # Find all <li> elements inside the <ul>
                li_elements = ul_element.find_all('li')

                # Iterate over each <li> element
                for li in li_elements:
                    # Find the <img> tag inside the <li>
                    img_tag = li.find('img')

                    # If the <img> tag exists, print the src attribute
                    if img_tag:
                        print(f"Image src: {img_tag.get('src')}")
                    else:
                        print("No <img> tag found.")
            else:
                print("No <ul> element with ID 'ggbl_slider' found.")
    finally:
        # Close the WebDriver
        driver.quit()


# Example usage:
url = 'https://the-starting-page.com'
find_products(url)
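
If you don't want to hard-code the chromedriver path (the '/usr/local/bin/chromedriver' above is Mac-specific), a minimal alternative setup is sketched below. It assumes Selenium 4.6 or newer, where the built-in Selenium Manager locates a matching driver automatically, so no Service(path) is needed.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # Selenium Manager resolves the driver binary
driver.get('https://the-starting-page.com')
print(driver.title)
driver.quit()

Swapping this driver setup into find_products() would remove the OS-specific path without changing anything else in the script.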