Skip to content

Instantly share code, notes, and snippets.

@yucedagonurcan
Created May 28, 2019 20:07
Show Gist options
  • Select an option

  • Save yucedagonurcan/6aa44bea5051e8e6c3b35e095be2e1cf to your computer and use it in GitHub Desktop.

Select an option

Save yucedagonurcan/6aa44bea5051e8e6c3b35e095be2e1cf to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time
import requests as req
import cv2
import json
import numpy as np
# --- Chrome driver setup -------------------------------------------------
# Start a shared browser session for the whole script. Image loading is
# disabled (pref value 2 == block) so result pages render faster while
# scraping; thumbnails metadata is read from the page source, not pixels.
driver_path = "/Users/olmaditekrar/Documents/chromedriver"

_chrome_opts = webdriver.ChromeOptions()
_chrome_opts.add_experimental_option(
    "prefs", {'profile.managed_default_content_settings.images': 2})

driver = webdriver.Chrome(executable_path=driver_path,
                          chrome_options=_chrome_opts)
def validate_image_(img_path):
    """Return True if the image at ``img_path`` can be decoded by OpenCV.

    ``cv2.imread`` returns ``None`` (instead of raising) for missing or
    corrupt files, so the ``is not None`` check is the primary validation;
    the except clause only guards against unexpected decoder errors.

    :param img_path: path to the image file on disk.
    :return: True when the file decodes to an image, False otherwise.
    """
    try:
        image = cv2.imread(img_path)
        return image is not None
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; ``Exception`` keeps the best-effort behavior
        # without trapping interpreter-control exceptions.
        print("Couldn't opened the image: {}".format(img_path))
        return False
def validate_images(img_paths):
    """Delete every file in ``img_paths`` that is not a readable image.

    :param img_paths: iterable of image file paths to check.
    :return: number of files removed from disk.
    """
    removed = 0
    for path in img_paths:
        if validate_image_(path):
            continue
        print("[INFO] Deleting: {}".format(path))
        os.remove(path)
        removed += 1
    return removed
def download_images(image_urls, output_dir):
    """Download each URL in ``image_urls`` into ``output_dir``.

    Files are named ``00000000.jpg``, ``00000001.jpg``, ... in URL order.
    A file that already exists is kept and counted (resume support).
    Failures are logged and skipped — the download is best effort.

    :param image_urls: iterable of image URLs to fetch.
    :param output_dir: directory for the downloaded files (created if absent).
    :return: list of local paths that exist on disk afterwards.
    """
    total = 0
    image_paths = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    for url in image_urls:
        path_img = os.path.sep.join([output_dir, "{}.jpg".format(
            str(total).zfill(8)
        )])
        # Resume support: keep files downloaded on a previous run.
        if os.path.isfile(path_img):
            total += 1
            image_paths.append(path_img)
            continue
        try:
            res = req.get(url=url, timeout=60)
            # Without this check, HTTP error pages (e.g. 404 HTML) were
            # silently saved to disk as ``.jpg`` files.
            res.raise_for_status()
            # ``with`` guarantees the handle is closed even if the write
            # fails (the original leaked the handle on error).
            with open(path_img, "wb") as fp:
                fp.write(res.content)
        except Exception as exc:
            # Was a bare ``except:``; now narrowed and logs the reason.
            print("[FAIL] downloading {}... skipping ({})".format(url, exc))
            continue
        image_paths.append(path_img)
        print("[SUCCESS] downloaded: {}".format(path_img))
        total += 1
    return image_paths
def go_img_search(keyword="Cairo"):
    """Point the shared driver at Google Image search results for ``keyword``.

    :param keyword: search term to query (default ``"Cairo"``).
    """
    results_url = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch".format(keyword)
    driver.get(results_url)
def scroll_until_num_imgs(num_imgs=150):
    """Scroll the Google Images results page until at least ``num_imgs``
    thumbnails are loaded (or a full pass adds no new ones), then return
    their original-image URLs.

    Relies on the module-level ``driver`` already being on an image-search
    results page (see ``go_img_search``).

    :param num_imgs: stop once this many result thumbnails are present.
    :return: list of original-image URL strings.
    """
    current_num_elements = 0
    old_num_elements = -1  # sentinel so the loop body runs at least once
    imgs = []
    # Stop when a pass adds no new results, or enough have been collected.
    while(current_num_elements != old_num_elements and current_num_elements < num_imgs):
        old_num_elements = current_num_elements
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Each result thumbnail carries its metadata as JSON inside a
        # ``div.rg_meta`` element.
        imgs = soup.find_all("div", {"class": "rg_meta"})
        current_num_elements = len(imgs)
        # Check if "Show Other Results" Button is available
        # (presumably width == 0 means the button is present but hidden
        # — TODO confirm against the live page).
        if driver.find_element_by_xpath("*//input[@class='ksb'][@id='smb']").size['width'] != 0:
            # Scroll to the end of the page.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Click "Show Other Results" Button
            driver.find_element_by_xpath(
                "*//input[@class='ksb'][@id='smb']").click()
        # Scroll to the end of the page.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # give lazily loaded thumbnails time to appear
    # "ou" is the original-image URL key in Google's rg_meta JSON payload.
    imgs = [json.loads(image.text)["ou"] for image in imgs]
    return imgs
def main():
    """End-to-end scrape: search, collect URLs, download, prune corrupt files."""
    target_count = 340
    query = "turtle"

    go_img_search(query)
    urls = scroll_until_num_imgs(target_count)

    # Persist the scraped URLs (one per line) before downloading anything.
    np.savetxt("{}.txt".format(query.replace(" ", "_")), np.array(urls), fmt="%s")
    print("Length of Imgs:", len(urls))

    # The browser is no longer needed once the URL list is captured.
    driver.quit()

    saved_paths = download_images(image_urls=urls, output_dir="Images/" + query)
    removed = validate_images(img_paths=saved_paths)
    print("[INFO] {} images removed for corrupted structure.".format(removed))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment