Last active
October 10, 2020 23:34
-
-
Save safa-dayo/cbd7c64ad6de9aa451d6a35d7e87c5be to your computer and use it in GitHub Desktop.
Scrape Google images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code was written in the following YouTube video:
# https://www.youtube.com/watch?v=qBQ7MNPDq7I
#
# It was produced while learning about scraping by hand-copying the code below:
# https://gist.github.com/genekogan/ebd77196e4bf0705db51f86431099e57#gistcomment-3465381
#
# Running this code requires the following packages to be installed:
# Selenium, chromedriver-binary
from selenium import webdriver | |
import chromedriver_binary | |
import requests | |
import argparse | |
import os | |
import time | |
import shutil | |
def save_img(inp, img, i, directory):
    """Download one image and store it as ``<inp><i>.jpg`` inside *directory*.

    The function is best-effort: any failure while downloading or writing is
    reported on stdout and swallowed, so the caller can move on to the next
    image.

    Args:
        inp: Search term; used as the file-name prefix.
        img: URL of the image to download.
        i: Index appended to the prefix to make the file name unique.
        directory: Directory the file is written into.
    """
    try:
        filename = inp + str(i) + ".jpg"
        image_path = os.path.join(directory, filename)
        print(image_path)
        # A timeout keeps one stalled server from hanging the whole scrape;
        # the ``with`` block returns the streamed connection to the pool.
        with requests.get(img, stream=True, timeout=30) as response:
            # Reject HTTP error responses *before* opening the target file,
            # otherwise an error page would be saved under a .jpg name.
            response.raise_for_status()
            # ``response.raw`` bypasses requests' content decoding; without
            # this flag a gzip/deflate body would be written still compressed.
            response.raw.decode_content = True
            with open(image_path, "wb") as file:
                shutil.copyfileobj(response.raw, file)
    except Exception as exc:
        # Deliberately broad: reading ``response.raw`` can raise urllib3
        # errors that are not requests exceptions, and a single bad image
        # must not abort the run. Ctrl-C (KeyboardInterrupt) still propagates.
        print("ERROR: save_img:", exc)
def find_urls(inp, url, driver, directory):
    """Load an image-search result page and save each result via ``save_img``.

    The page is scrolled a few times so more thumbnails get loaded, then every
    matching thumbnail is clicked and the ``src`` of the image found at a fixed
    XPath is handed to ``save_img``. Failures for a single thumbnail are
    reported and skipped.

    Args:
        inp: Search term; forwarded to ``save_img`` as the file-name prefix.
        url: Address of the result page to open.
        driver: Selenium WebDriver instance used to drive the browser.
        directory: Target directory; forwarded to ``save_img``.
    """
    # Local import keeps this function self-contained. ``find_element(By...)``
    # works on Selenium 3 and 4, whereas the ``find_element_by_*`` helpers
    # were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver.get(url)

    for _ in range(5):
        driver.execute_script("window.scrollBy(0, 10000)")
        try:
            # Presumably the "show more results" button -- confirm against the
            # current page markup; it is not present on every scroll.
            driver.find_element(By.CSS_SELECTOR, ".mye4qd").click()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit are not swallowed.
            continue

    thumbnails = driver.find_elements(
        By.XPATH, '//img[contains(@class, "rg_i Q4LuWd")]'
    )
    for j, thumbnail in enumerate(thumbnails):
        try:
            thumbnail.click()
            # Presumably the enlarged preview shown after the click; the
            # absolute XPath is brittle and tied to the page layout.
            img = driver.find_element(
                By.XPATH,
                '//body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img',
            ).get_attribute("src")
            save_img(inp, img, j, directory)
            # Pause between downloads (only reached when the steps above succeed).
            time.sleep(3)
        except Exception as exc:
            # Best-effort per thumbnail: report the cause and continue.
            print("ERROR: find_urls:", exc)
if __name__ == "__main__":
    # Script entry point: parse the CLI options, make sure the download
    # directory exists, then run the scrape with a Chrome WebDriver.

    # Imported here so this block brings its own dependency into scope.
    from urllib.parse import quote_plus

    parser = argparse.ArgumentParser(description="Scrape Google images")
    parser.add_argument("-s", "--search", default="bananas", type=str, help="search term")
    parser.add_argument("-d", "--directory", default="./downloads", type=str, help="Save directory")
    args = parser.parse_args()

    directory = args.directory
    inp = args.search

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(directory, exist_ok=True)

    # URL-encode the search term so spaces, '&', '#' and non-ASCII text
    # cannot break or alter the query string.
    url = (
        "https://www.google.com/search?q="
        + quote_plus(inp)
        + "&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"
    )

    # Start the browser last, and always shut it down, so a failure or Ctrl-C
    # does not leave Chrome/chromedriver processes behind.
    driver = webdriver.Chrome()
    try:
        find_urls(inp, url, driver, directory)
    finally:
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment