Skip to content

Instantly share code, notes, and snippets.

@safa-dayo
Last active October 10, 2020 23:34
Show Gist options
  • Save safa-dayo/cbd7c64ad6de9aa451d6a35d7e87c5be to your computer and use it in GitHub Desktop.
Save safa-dayo/cbd7c64ad6de9aa451d6a35d7e87c5be to your computer and use it in GitHub Desktop.
Scrape Google images
# * This is the code written in the following YouTube video:
# https://www.youtube.com/watch?v=qBQ7MNPDq7I
#
# * It was written while learning web scraping by retyping the code below:
# https://gist.github.com/genekogan/ebd77196e4bf0705db51f86431099e57#gistcomment-3465381
#
# Running this code requires the following packages to be installed:
# Selenium, chromedriver-binary, requests
from selenium import webdriver
import chromedriver_binary
import requests
import argparse
import os
import time
import shutil
def save_img(inp, img, i, directory):
    """Download one image and store it as ``<inp><i>.jpg`` inside *directory*.

    Args:
        inp: Search term; used as the file-name prefix.
        img: Value of the image's ``src`` attribute. Either an http(s) URL or
            an inline ``data:`` URI. ``None``/empty is reported and skipped.
        i: Index of the image; appended to the file-name prefix.
        directory: Existing directory the file is written into.

    Returns:
        None. Failures are reported on stdout and never raised for network,
        decoding or file-system errors, so one bad image does not abort the
        caller's loop.
    """
    if not img:
        # An <img> without a src attribute yields None; nothing to fetch.
        print("ERROR: save_img (no image source)")
        return

    image_path = os.path.join(directory, inp + str(i) + ".jpg")
    try:
        if img.startswith("data:"):
            # Inline images cannot be fetched by requests (unsupported
            # scheme); urllib decodes data: URIs natively.
            from urllib.request import urlopen

            with urlopen(img) as inline:
                payload = inline.read()
            print(image_path)
            with open(image_path, "wb") as file:
                file.write(payload)
        else:
            # The timeout keeps a stalled server from hanging the scrape.
            with requests.get(img, stream=True, timeout=10) as response:
                # Do not save an HTML error page under a .jpg name.
                response.raise_for_status()
                # Make the raw stream transparently undo gzip/deflate.
                response.raw.decode_content = True
                print(image_path)
                with open(image_path, "wb") as file:
                    shutil.copyfileobj(response.raw, file)
    except (requests.RequestException, OSError, ValueError) as exc:
        # Best effort: report the cause and let the caller continue.
        print("ERROR: save_img", exc)
def find_urls(inp, url, driver, directory, scroll_rounds=5, pause=3):
    """Open an image search page and save every result image found on it.

    The page is scrolled ``scroll_rounds`` times so more thumbnails load, then
    each thumbnail is clicked and the ``src`` of the enlarged preview is handed
    to ``save_img``.

    Args:
        inp: Search term; forwarded to ``save_img`` as the file-name prefix.
        url: Search-results URL to load.
        driver: Selenium WebDriver instance used to drive the browser.
        directory: Directory the images are saved into.
        scroll_rounds: How many times to scroll down before collecting
            thumbnails (default 5, the original hard-coded value).
        pause: Seconds to sleep after each saved image (default 3, the
            original hard-coded value).

    Returns:
        None. Per-image failures are printed and skipped.
    """
    driver.get(url)

    for _ in range(scroll_rounds):
        driver.execute_script("window.scrollBy(0, 10000)")
        try:
            # Presumably the "show more results" button; it is often absent
            # or not clickable, which is expected and simply skipped.
            # Locator strings ("css selector", "xpath") are used with
            # find_element()/find_elements() because the find_element_by_*
            # helpers were removed in Selenium 4.3.
            driver.find_element("css selector", ".mye4qd").click()
        except Exception:
            continue

    thumbnails = driver.find_elements(
        "xpath", '//img[contains(@class, "rg_i Q4LuWd")]'
    )
    for j, thumbnail in enumerate(thumbnails):
        try:
            thumbnail.click()
            # Absolute path to the enlarged preview image; tied to the page
            # layout at the time of writing.
            preview = driver.find_element(
                "xpath",
                '//body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img',
            )
            img = preview.get_attribute("src")
            save_img(inp, img, j, directory)
            time.sleep(pause)
        except Exception as exc:
            # Best effort per thumbnail. Exception (not a bare except) so
            # Ctrl-C still stops the run.
            print("ERROR: find_urls", exc)
if __name__ == "__main__":
    # Script entry point: parse the CLI options, make sure the output
    # directory exists, then scrape the image search results for the term.
    from urllib.parse import quote_plus

    parser = argparse.ArgumentParser(description="Scrape Google images")
    parser.add_argument("-s", "--search", default="bananas", type=str, help="search term")
    parser.add_argument("-d", "--directory", default="./downloads", type=str, help="Save directory")
    args = parser.parse_args()

    directory = args.directory
    inp = args.search

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(directory, exist_ok=True)

    # The term must be URL-encoded: spaces, '&', '#', or non-ASCII text would
    # otherwise corrupt the query string.
    url = (
        "https://www.google.com/search?q="
        + quote_plus(str(inp))
        + "&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"
    )

    driver = webdriver.Chrome()
    try:
        find_urls(inp, url, driver, directory)
    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment