FBosler · May 8, 2023 12:33 · WittmannF · Aug 29, 2020 · Manuj229 · May 25, 2021
diff --git a/get_image_links_new.py b/get_image_links_new.py
 #Copyright 2022 Fabian Bosler

 # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
 # files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 # modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom 
 # the Software is furnished to do so, subject to the following conditions:

 # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 
 # Software.

 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
 # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 
 # OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    """Collect full-size image URLs from a Google image search (``tbm=isch``).

    The results page is scrolled, every new thumbnail (``img.Q4LuWd``) is
    clicked, and the ``src`` of each opened image (``img.n3VNCb``) that
    contains ``http`` is recorded.

    Args:
        query: Search term, inserted verbatim into the search URL.
        max_links_to_fetch: Stop once this many distinct URLs were collected.
        wd: Selenium webdriver used to load and interact with the page.
        sleep_between_interactions: Seconds to pause after each scroll/click.

    Returns:
        A set of image URL strings. It holds fewer than
        ``max_links_to_fetch`` entries when the page stops yielding new
        thumbnails, and is empty when nothing was found.
    """
    def scroll_to_end(wd):
        # Jump to the bottom of the document, then pause.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        if number_results <= results_start:
            # The last pass produced no new thumbnails: stop here instead of
            # looping forever, and hand back whatever was collected.
            print(f"Found: {len(image_urls)} image links, no further results available.")
            break

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped.
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # for/else: only reached when every new thumbnail was processed
            # without hitting the requested number of links.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # The plural lookup returns an empty list (instead of raising)
            # when the "load more" button is not on the page.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = number_results

    return image_urls
	#Copyright 2022 Fabian Bosler

	# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
	# files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
	# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom
	# the Software is furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
	# Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
	# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
	# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
	# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
		"""Collect full-size image URLs from a Google image search (``tbm=isch``).

		The results page is scrolled, every new thumbnail (``img.Q4LuWd``) is
		clicked, and the ``src`` of each opened image (``img.n3VNCb``) that
		contains ``http`` is recorded.

		Args:
			query: Search term, inserted verbatim into the search URL.
			max_links_to_fetch: Stop once this many distinct URLs were collected.
			wd: Selenium webdriver used to load and interact with the page.
			sleep_between_interactions: Seconds to pause after each scroll/click.

		Returns:
			A set of image URL strings. It holds fewer than
			``max_links_to_fetch`` entries when the page stops yielding new
			thumbnails, and is empty when nothing was found.
		"""
		def scroll_to_end(wd):
			# Jump to the bottom of the document, then pause.
			wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			time.sleep(sleep_between_interactions)

		# build the google query
		search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

		# load the page
		wd.get(search_url.format(q=query))

		image_urls = set()
		results_start = 0
		while len(image_urls) < max_links_to_fetch:
			scroll_to_end(wd)

			# get all image thumbnail results
			thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
			number_results = len(thumbnail_results)

			print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

			if number_results <= results_start:
				# The last pass produced no new thumbnails: stop here instead of
				# looping forever, and hand back whatever was collected.
				print(f"Found: {len(image_urls)} image links, no further results available.")
				break

			for img in thumbnail_results[results_start:number_results]:
				# try to click every thumbnail such that we can get the real image behind it
				try:
					img.click()
					time.sleep(sleep_between_interactions)
				except Exception:
					# Best effort: a thumbnail that cannot be clicked is skipped.
					continue

				# extract image urls
				for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
					src = actual_image.get_attribute('src')
					if src and 'http' in src:
						image_urls.add(src)

				if len(image_urls) >= max_links_to_fetch:
					print(f"Found: {len(image_urls)} image links, done!")
					break
			else:
				# for/else: only reached when every new thumbnail was processed
				# without hitting the requested number of links.
				print("Found:", len(image_urls), "image links, looking for more ...")
				time.sleep(30)
				# The plural lookup returns an empty list (instead of raising)
				# when the "load more" button is not on the page.
				if wd.find_elements_by_css_selector(".mye4qd"):
					wd.execute_script("document.querySelector('.mye4qd').click();")

			# move the result startpoint further down
			results_start = number_results

		return image_urls
No results found