@vijayanandrp
Created November 27, 2017 04:33
A simple script to scrape images from Google image search. - https://informationcorners.com/google-search-image-scraper/
# coding: utf-8
import time # Importing the time library to check the time of code execution
import sys # Importing the System Library
import os
import urllib.request
import rfc6266, requests
import random
import string
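# Note: rfc6266 and requests are third-party packages (installed separately, e.g. with pip);
# the remaining imports are from the Python 3 standard library. Only rfc6266 is used directly
# below; the script itself fetches pages with urllib.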
# ########## Edit From Here ###########
# This list holds the search keywords. Edit it to search for Google images of your choice.
# You can simply add and remove elements of the list.
search_keyword = ['famous']
# This list adds a suffix to each search term.
# Each element of the list will help you download 100 images.
# The first element is blank, which means no suffix is added to the search keyword from the list above.
# You can edit the list by adding/deleting elements.
# So if the first element of search_keyword is 'Australia' and the second element of keywords is 'high resolution',
# then it will search for 'Australia high resolution' (illustrated in the sketch just after this configuration block).
keywords = ['quotes', 'sayings', 'best quotes', 'memes']
# local storage folder
storage_path = 'media'
storage_path = os.path.join(os.getcwd(), storage_path)
# ########## End of Editing ###########
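
# Illustrative sketch (not part of the original script): shows how the two lists above
# combine into Google image-search queries, mirroring the 'Australia' / 'high resolution'
# example in the comments. The helper name and the trimmed-down URL are assumptions;
# the main loop below builds the full URL with additional query parameters.
def _example_queries(primary='Australia', suffixes=('', 'high resolution')):
    queries = []
    for suffix in suffixes:
        term = (primary + ' ' + suffix).strip().replace(' ', '%20')
        queries.append('https://www.google.com/search?q=' + term + '&tbm=isch')
    return queries
# e.g. _example_queries() returns the plain 'Australia' query and the
# 'Australia%20high%20resolution' query.
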
error_links = []
error_count = 0
# random string
def random_string(n=8):
    chars = string.ascii_uppercase + string.ascii_lowercase + string.digits
    return ''.join([random.choice(chars) for _ in range(n)])
# Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, "
                                "like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req)
        response_data = str(resp.read())
        return response_data
    except Exception as e:
        error_links.append(url)
        print(str(e))
        return ''  # return an empty page so the link parser below does not crash on None
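
# Sketch of an equivalent fetch using the requests package imported above but otherwise
# unused; an assumption about how download_page could be rewritten, not something the
# original script calls. Note that the parser below expects the str(bytes) representation
# produced by download_page, so swapping this in would also mean revisiting the parsing.
def download_page_with_requests(url):
    try:
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, "
                                 "like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
        resp = requests.get(url, headers=headers, timeout=15)
        return resp.text
    except Exception as e:
        error_links.append(url)
        print(str(e))
        return ''
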
# Finding 'Next Image' from the given raw page
def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:  # If no links are found then give an error!
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content
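# Rough illustration (the sample markup is an assumption about the page format this parser
# expects, not captured output): each result carries an rg_meta JSON blob such as
#   <div class="rg_meta">{"ou":"http://example.com/a.jpg","ow":800, ...}</div>
# find('"ou"') + 6 skips past the six characters of '"ou":"', and the ',"ow"' marker
# (minus the closing quote) ends the slice, leaving just the image URL.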
# Getting all links with the help of '_images_get_next_item'
def _images_get_all_items(page):
    _items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            error_url = item
            fault_value = 'u003d'
            if fault_value in error_url:
                urls = error_url.split(fault_value, 2)
                url = urls[1]
                url = url.split('\\', 2)[0]
                item = url
            _items.append(item)  # Append all the links in the list named 'Links'
            time.sleep(0.1)  # Timer could be used to slow down the request for image downloads
            page = page[end_content:]
    return _items
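# Example of the 'u003d' clean-up above (hypothetical link, shown only to illustrate the
# splitting): a redirect-style item such as
#   /imgres?imgurl\u003dhttp://example.com/a.jpg\u0026imgrefurl=...
# is split on 'u003d' and then on the backslash, leaving 'http://example.com/a.jpg'.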
############## Main Program ############
t0 = time.time() # start the timer
# Download Image Links
index = 0
while index < len(search_keyword):
    items = []
    iteration = "Item no.: {} --> Item name = {}".format(index + 1, search_keyword[index])
    print(iteration)
    print("Evaluating...")
    search_keywords = search_keyword[index]
    search = search_keywords.replace(' ', '%20')
    path_to_store = os.path.join(storage_path, search_keywords.strip().replace(' ', '_'))
    # make a search keyword directory
    if not os.path.exists(path_to_store):
        os.makedirs(path_to_store)
    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + '%20' + pure_keyword + \
              '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = download_page(url)
        time.sleep(0.1)
        items += _images_get_all_items(raw_html)
        j += 1
    # print("Image Links = " + str(items))
    print("Total Image Links = " + str(len(items)))
    print("\n")
    # This allows you to write all the links into a text file.
    # The text file will be created in the same directory as your code.
    # You can comment out the lines below to stop writing the output to the text file.
    # Open the text file called output.txt
    with open('output.txt', 'a') as info:
        info.write('\n')
        info.write("Google search results for " + search_keyword[index])
        info.write("Total Image Links = " + str(len(items)))
        info.write('\n')
        info.write(str(index) + ': ' + str(search_keyword[index]) + ": " + str(items) + "\n\n\n")
    t1 = time.time()  # stop the timer
    total_time = t1 - t0  # Calculating the total time required to crawl
    print("Total time taken: " + str(total_time) + " Seconds")
print("Starting Download...")
# To save images to the same directory
# IN this saving process we are just skipping the URL if there is any error
k = 0
while k < len(items):
try:
header = dict()
header['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) " \
"Chrome/41.0.2228.0 Safari/537.36"
req = urllib.request.Request(items[k], headers=header)
print(items[k])
response = urllib.request.urlopen(req, None, 15)
file_name = rfc6266.parse_requests_response(response).filename_unsafe
file_to_write = os.path.join(path_to_store, random_string() + '_' + file_name)
with open(file_to_write, 'wb') as output_file:
data = response.read()
output_file.write(data)
response.close()
print("completed ====> " + str(k + 1))
k += 1
except Exception as e:
error_links.append(items[k])
error_count += 1
print("URLError " + str(k))
k += 1
t2 = time.time() # stop the timer
total_time = t2 - t1 # Calculating the total time required to crawl,
print("Total time taken: " + str(total_time) + " Seconds")
index += 1
print("\n")
print("Everything downloaded!")
print("\n" + str(error_count) + " ----> Total Errors")
print(error_links)
# ----End of the main program ----#