Skip to content

Instantly share code, notes, and snippets.

@yucedagonurcan
Created May 28, 2019 20:07
Show Gist options
  • Select an option

  • Save yucedagonurcan/6aa44bea5051e8e6c3b35e095be2e1cf to your computer and use it in GitHub Desktop.

Select an option

Save yucedagonurcan/6aa44bea5051e8e6c3b35e095be2e1cf to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time
import requests as req
import cv2
import json
import numpy as np
# --- Chrome driver setup -------------------------------------------------
# Start a shared browser session for the whole script. Image loading is
# disabled (pref value 2 == block) so result pages render faster while
# scraping; thumbnails metadata is read from the page source, not pixels.
driver_path = "/Users/olmaditekrar/Documents/chromedriver"

_chrome_opts = webdriver.ChromeOptions()
_chrome_opts.add_experimental_option(
    "prefs", {'profile.managed_default_content_settings.images': 2})

driver = webdriver.Chrome(executable_path=driver_path,
                          chrome_options=_chrome_opts)
def validate_image_(img_path):
    """Return True if the image at ``img_path`` can be decoded by OpenCV.

    ``cv2.imread`` returns ``None`` (instead of raising) for missing or
    corrupt files, so the ``is not None`` check is the primary validation;
    the except clause only guards against unexpected decoder errors.

    :param img_path: path to the image file on disk.
    :return: True when the file decodes to an image, False otherwise.
    """
    try:
        image = cv2.imread(img_path)
        return image is not None
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; ``Exception`` keeps the best-effort behavior
        # without trapping interpreter-control exceptions.
        print("Couldn't opened the image: {}".format(img_path))
        return False
def validate_images(img_paths):
    """Delete every file in ``img_paths`` that is not a readable image.

    :param img_paths: iterable of image file paths to check.
    :return: number of files removed from disk.
    """
    removed = 0
    for path in img_paths:
        if validate_image_(path):
            continue
        print("[INFO] Deleting: {}".format(path))
        os.remove(path)
        removed += 1
    return removed
def download_images(image_urls, output_dir):
    """Download each URL in ``image_urls`` into ``output_dir``.

    Files are named ``00000000.jpg``, ``00000001.jpg``, ... in URL order.
    A file that already exists is kept and counted (resume support).
    Failures are logged and skipped — the download is best effort.

    :param image_urls: iterable of image URLs to fetch.
    :param output_dir: directory for the downloaded files (created if absent).
    :return: list of local paths that exist on disk afterwards.
    """
    total = 0
    image_paths = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    for url in image_urls:
        path_img = os.path.sep.join([output_dir, "{}.jpg".format(
            str(total).zfill(8)
        )])
        # Resume support: keep files downloaded on a previous run.
        if os.path.isfile(path_img):
            total += 1
            image_paths.append(path_img)
            continue
        try:
            res = req.get(url=url, timeout=60)
            # Without this check, HTTP error pages (e.g. 404 HTML) were
            # silently saved to disk as ``.jpg`` files.
            res.raise_for_status()
            # ``with`` guarantees the handle is closed even if the write
            # fails (the original leaked the handle on error).
            with open(path_img, "wb") as fp:
                fp.write(res.content)
        except Exception as exc:
            # Was a bare ``except:``; now narrowed and logs the reason.
            print("[FAIL] downloading {}... skipping ({})".format(url, exc))
            continue
        image_paths.append(path_img)
        print("[SUCCESS] downloaded: {}".format(path_img))
        total += 1
    return image_paths
def go_img_search(keyword="Cairo"):
    """Point the shared driver at Google Image search results for ``keyword``.

    :param keyword: search term to query (default ``"Cairo"``).
    """
    results_url = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch".format(keyword)
    driver.get(results_url)
def scroll_until_num_imgs(num_imgs=150):
    """Scroll the Google Images results page until at least ``num_imgs``
    thumbnails are loaded (or a full pass adds no new ones), then return
    their original-image URLs.

    Relies on the module-level ``driver`` already being on an image-search
    results page (see ``go_img_search``).

    :param num_imgs: stop once this many result thumbnails are present.
    :return: list of original-image URL strings.
    """
    current_num_elements = 0
    old_num_elements = -1  # sentinel so the loop body runs at least once
    imgs = []
    # Stop when a pass adds no new results, or enough have been collected.
    while(current_num_elements != old_num_elements and current_num_elements < num_imgs):
        old_num_elements = current_num_elements
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Each result thumbnail carries its metadata as JSON inside a
        # ``div.rg_meta`` element.
        imgs = soup.find_all("div", {"class": "rg_meta"})
        current_num_elements = len(imgs)
        # Check if "Show Other Results" Button is available
        # (presumably width == 0 means the button is present but hidden
        # — TODO confirm against the live page).
        if driver.find_element_by_xpath("*//input[@class='ksb'][@id='smb']").size['width'] != 0:
            # Scroll to the end of the page.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Click "Show Other Results" Button
            driver.find_element_by_xpath(
                "*//input[@class='ksb'][@id='smb']").click()
        # Scroll to the end of the page.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # give lazily loaded thumbnails time to appear
    # "ou" is the original-image URL key in Google's rg_meta JSON payload.
    imgs = [json.loads(image.text)["ou"] for image in imgs]
    return imgs
def main():
    """End-to-end scrape: search, collect URLs, download, prune corrupt files."""
    target_count = 340
    query = "turtle"

    go_img_search(query)
    urls = scroll_until_num_imgs(target_count)

    # Persist the scraped URLs (one per line) before downloading anything.
    np.savetxt("{}.txt".format(query.replace(" ", "_")), np.array(urls), fmt="%s")
    print("Length of Imgs:", len(urls))

    # The browser is no longer needed once the URL list is captured.
    driver.quit()

    saved_paths = download_images(image_urls=urls, output_dir="Images/" + query)
    removed = validate_images(img_paths=saved_paths)
    print("[INFO] {} images removed for corrupted structure.".format(removed))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment