Created
May 28, 2019 20:07
-
-
Save yucedagonurcan/6aa44bea5051e8e6c3b35e095be2e1cf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.common.keys import Keys | |
| from bs4 import BeautifulSoup | |
| import re | |
| import pandas as pd | |
| import os | |
| import time | |
| import requests as req | |
| import cv2 | |
| import json | |
| import numpy as np | |
# Location of the ChromeDriver binary used to launch the browser.
driver_path = "/Users/olmaditekrar/Documents/chromedriver"

# Chrome preferences passed to the browser as an experimental option.
# NOTE(review): the value 2 presumably blocks image loading so result pages
# render faster -- confirm against the Chrome preference documentation.
prefs = {"profile.managed_default_content_settings.images": 2}
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", prefs)

# Opening the browser; this module-level driver is shared by the functions below.
driver = webdriver.Chrome(
    executable_path=driver_path,
    chrome_options=chromeOptions,
)
def validate_image_(img_path):
    """Report whether OpenCV can decode the file at ``img_path``.

    Args:
        img_path: Path of the image file to check.

    Returns:
        True when ``cv2.imread`` yields an image, False when it yields
        ``None`` or raises an ordinary exception.
    """
    try:
        image = cv2.imread(img_path)
    except Exception:
        # Only ordinary errors mean "invalid image". A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        print("Couldn't opened the image: {}".format(img_path))
        return False
    return image is not None
def validate_images(img_paths):
    """Delete every file in ``img_paths`` that fails ``validate_image_``.

    Args:
        img_paths: Iterable of image file paths to check.

    Returns:
        The number of files that were removed from disk.
    """
    removed = 0
    for candidate in img_paths:
        if validate_image_(candidate):
            # Valid image: keep it and move on.
            continue
        print("[INFO] Deleting: {}".format(candidate))
        os.remove(candidate)
        removed += 1
    return removed
def download_images(image_urls, output_dir):
    """Download every URL into ``output_dir`` as sequentially numbered JPGs.

    Files are named ``00000000.jpg``, ``00000001.jpg``, ... The counter only
    advances when a slot is filled, so a failed download leaves no gap in
    the numbering. When the file for the current slot already exists it is
    kept and the URL is not fetched.

    Args:
        image_urls: Iterable of image URLs to fetch.
        output_dir: Directory to store the images in; created when missing.

    Returns:
        List of file paths, in order, of the downloaded or already present
        images.
    """
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    total = 0
    for url in image_urls:
        path_img = os.path.join(
            output_dir, "{}.jpg".format(str(total).zfill(8)))

        # Slot already filled on disk: reuse it instead of downloading again.
        if os.path.isfile(path_img):
            image_paths.append(path_img)
            total += 1
            continue

        try:
            res = req.get(url=url, timeout=60)
            # Treat HTTP error responses as failures instead of saving an
            # error page under a .jpg name.
            res.raise_for_status()
            # The context manager closes the file even if the write fails.
            with open(path_img, "wb") as out_file:
                out_file.write(res.content)
        except Exception:
            # Best effort: skip this URL. KeyboardInterrupt is not caught,
            # so the loop can still be aborted with Ctrl-C.
            print("[FAIL] downloading {}... skipping".format(url))
            continue

        image_paths.append(path_img)
        print("[SUCCESS] downloaded: {}".format(path_img))
        total += 1
    return image_paths
def go_img_search(keyword="Cairo"):
    """Open the Google Images result page for ``keyword`` in the browser.

    The keyword is URL-encoded before it is placed in the query string, so
    spaces and reserved characters such as ``&`` or ``#`` stay part of the
    search term instead of corrupting the URL.

    Args:
        keyword: Search term to look up.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import quote_plus

    driver.get("https://www.google.co.in/search?q=" +
               quote_plus(keyword) + "&source=lnms&tbm=isch")
def scroll_until_num_imgs(num_imgs=150):
    """Scroll the current result page until enough results are loaded.

    Each pass parses the page source, counts the ``div`` elements with
    class ``rg_meta``, clicks the "Show Other Results" button when it has a
    non-zero width, scrolls to the bottom of the page and waits one second.
    The loop ends once at least ``num_imgs`` elements were counted or a
    pass counts the same number of elements as the previous pass.

    Args:
        num_imgs: Number of result elements to aim for.

    Returns:
        List with the ``"ou"`` value of the JSON text of every ``rg_meta``
        element from the last parsed page (presumably the original image
        URL -- confirm). The list is not truncated to ``num_imgs``.
    """
    more_button_xpath = "*//input[@class='ksb'][@id='smb']"
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

    meta_divs = []
    seen_now, seen_before = 0, -1
    while seen_now != seen_before and seen_now < num_imgs:
        seen_before = seen_now
        page = BeautifulSoup(driver.page_source, 'lxml')
        meta_divs = page.find_all("div", {"class": "rg_meta"})
        seen_now = len(meta_divs)

        # A non-zero width is taken as the sign that the button is showing.
        button_width = driver.find_element_by_xpath(
            more_button_xpath).size['width']
        if button_width != 0:
            # Bring the button into view, then click it.
            driver.execute_script(scroll_to_bottom)
            driver.find_element_by_xpath(more_button_xpath).click()

        # Scroll to the end of the page and wait one second before recounting.
        driver.execute_script(scroll_to_bottom)
        time.sleep(1)

    urls = []
    for meta in meta_divs:
        urls.append(json.loads(meta.text)["ou"])
    return urls
def main(search_query="turtle", num_of_images=340):
    """Scrape Google Images for ``search_query`` and download the results.

    Opens the search page, collects the image URLs, saves them to
    ``<search_query>.txt`` (spaces replaced by underscores), downloads them
    into ``Images/<search_query>`` and deletes files that fail validation.

    Args:
        search_query: Term to search for; also used for the output names.
        num_of_images: Number of results to aim for, passed to
            ``scroll_until_num_imgs``.
    """
    try:
        go_img_search(search_query)
        imgs = scroll_until_num_imgs(num_of_images)
    finally:
        # Close the browser even when scraping fails, so that no browser
        # process is left behind.
        driver.quit()

    # Store the URLs in a text file.
    url_list_path = "{}.txt".format(search_query.replace(" ", "_"))
    np.savetxt(url_list_path, np.array(imgs), fmt="%s")
    print("Length of Imgs:", len(imgs))

    image_paths = download_images(
        image_urls=imgs, output_dir="Images/" + search_query)
    deleted_count = validate_images(img_paths=image_paths)
    print("[INFO] {} images removed for corrupted structure.".format(deleted_count))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment