A simple method to scrape Google Images results from the Google search engine. - https://informationcorners.com/google-search-image-scraper/
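The script below builds one Google Images results URL per keyword/suffix pair, downloads the raw HTML with urllib, slices the image URLs out of the embedded rg_meta JSON blobs, and saves every image under media/<keyword>/. The only dependency outside the standard library is the third-party rfc6266 package (installable with pip), used to derive filenames from Content-Disposition headers.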
# coding: utf-8
import time             # used to time the crawl and to throttle requests
import os
import urllib.request
import rfc6266          # third-party (pip install rfc6266); parses Content-Disposition filenames
import random
import string
# ########## Edit From Here ###########
# The list of search keywords. Edit this list to search for Google images
# of your choice; you can simply add and remove elements.
search_keyword = ['famous']

# This list adds suffixes to each search term; every element yields one
# results page (roughly 100 images). Add an empty string '' as the first
# element if you also want the bare keyword searched on its own.
# For example, if an element of search_keyword is 'Australia' and an element
# of keywords is 'high resolution', the script searches for
# 'Australia high resolution'.
keywords = ['quotes', 'sayings', 'best quotes', 'memes']

# local storage folder
storage_path = 'media'
storage_path = os.path.join(os.getcwd(), storage_path)
# ########## End of Editing ###########
error_links = []
error_count = 0


# Generate a short random string, used to make saved filenames unique.
def random_string(n=8):
    chars = string.ascii_uppercase + string.ascii_lowercase + string.digits
    return ''.join(random.choice(chars) for _ in range(n))
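# A quick sanity check of the helper above (hypothetical output, since the
# result is random): random_string() might return something like 'aZ3kP9qL',
# and random_string(4) something like 'Qx7b'.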

# Download an entire web document (raw page content) as a string.
def download_page(url):
    try:
        header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, "
                                "like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req)
        # str() on the raw bytes keeps the escaped representation (b'...'),
        # which the substring parsing below relies on
        response_data = str(resp.read())
        return response_data
    except Exception as e:
        error_links.append(url)
        print(str(e))
        return ''  # return an empty page so callers can still scan it safely
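# A desktop browser User-Agent is sent because Google serves a different,
# harder-to-parse page to clients it identifies as scripts. The Chrome 41
# string above is simply the one this script shipped with; any recent
# desktop UA string should behave similarly.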

# Find the next image link in the given raw page.
def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:  # no more results on this page
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        # each result carries its metadata as JSON inside a class="rg_meta" div
        start_line = s.find('class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        # skip the 6 characters of '"ou":"' and drop the closing quote
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content
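# For reference, the metadata blob being sliced looks roughly like this
# (an illustrative, abbreviated example, not captured output):
#   {"id":"...","ou":"https://example.com/image.jpg","ow":1024,"oh":768,...}
# "ou" is the original image URL and "ow"/"oh" its dimensions, so slicing
# between '"ou":"' and ',"ow"' yields the bare URL.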

# Get all links on a page with the help of '_images_get_next_item'.
def _images_get_all_items(page):
    _items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            # some links come percent-escaped inside a redirect wrapper; the
            # 'u003d' (escaped '=') marker is used to cut out the real URL
            error_url = item
            fault_value = 'u003d'
            if fault_value in error_url:
                urls = error_url.split(fault_value, 2)
                url = urls[1]
                url = url.split('\\', 2)[0]
                item = url
            _items.append(item)  # collect every extracted link
            time.sleep(0.1)  # small delay to slow down the request rate
            page = page[end_content:]  # continue scanning past the last match
    return _items
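# The scan works like a cursor: after each hit the page string is sliced at
# 'end_content', so the next find() starts past the previous result. A rough
# trace with made-up offsets: find '"ou"' at 500 -> keep the URL, slice to
# page[560:], find the next '"ou"' near the start again, and so on until
# 'rg_di' no longer appears.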

# ############ Main Program ############
t0 = time.time()  # start the timer

# Download the image links
index = 0
while index < len(search_keyword):
    items = []
    iteration = "Item no.: {} --> Item name = {}".format(index + 1, search_keyword[index])
    print(iteration)
    print("Evaluating...")
    search_keywords = search_keyword[index]
    search = search_keywords.replace(' ', '%20')
    path_to_store = os.path.join(storage_path, search_keywords.strip().replace(' ', '_'))

    # make a directory per search keyword
    if not os.path.exists(path_to_store):
        os.makedirs(path_to_store)

    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + '%20' + pure_keyword + \
              '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
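        # Of the query parameters above, 'q' is the URL-encoded search text and
        # 'tbm=isch' selects the image-search tab; the rest ('ei', 'ved', 'biw',
        # 'bih', ...) appear to be session/viewport tokens copied from a live
        # browser request and are likely not all required.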
        raw_html = download_page(url)
        time.sleep(0.1)
        items += _images_get_all_items(raw_html)
        j += 1

    # print("Image Links = " + str(items))
    print("Total Image Links = " + str(len(items)))
    print("\n")

    # Append all the links to a text file ('output.txt') created in the same
    # directory as the script. Comment out the block below to skip this step.
    with open('output.txt', 'a') as info:
        info.write('\n')
        info.write("Google search results for " + search_keyword[index] + '\n')
        info.write("Total Image Links = " + str(len(items)))
        info.write('\n')
        info.write(str(index) + ': ' + str(search_keyword[index]) + ": " + str(items) + "\n\n\n")
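    # After one pass, output.txt holds a block roughly like this (illustrative
    # values, not captured output):
    #
    #   Google search results for famous
    #   Total Image Links = 400
    #   0: famous: ['https://example.com/a.jpg', 'https://example.com/b.png', ...]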
    t1 = time.time()  # stop the timer
    total_time = t1 - t0  # total time taken to crawl the links
    print("Total time taken: " + str(total_time) + " Seconds")
    print("Starting Download...")

    # Save the images to the keyword directory, skipping any URL that errors.
    k = 0
    while k < len(items):
        try:
            header = dict()
            header['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) " \
                                   "Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(items[k], headers=header)
            print(items[k])
            response = urllib.request.urlopen(req, None, 15)
            # derive a filename from the Content-Disposition header, then
            # prefix it with a random string to avoid collisions
            file_name = rfc6266.parse_requests_response(response).filename_unsafe
            file_to_write = os.path.join(path_to_store, random_string() + '_' + file_name)
            with open(file_to_write, 'wb') as output_file:
                data = response.read()
                output_file.write(data)
            response.close()
            print("completed ====> " + str(k + 1))
            k += 1
        except Exception as e:
            error_links.append(items[k])
            error_count += 1
            print("Error at link no. " + str(k) + ": " + str(e))
            k += 1
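    # urlopen(req, None, 15) sets a 15-second timeout, so a slow or dead host
    # raises and gets logged rather than stalling the whole crawl; the loop
    # then simply moves on to the next link.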
    t2 = time.time()  # stop the timer
    total_time = t2 - t1  # total time taken to download the images
    print("Total time taken: " + str(total_time) + " Seconds")
    index += 1
    print("\n")

print("Everything downloaded!")
print("\n" + str(error_count) + " ----> Total Errors")
print(error_links)
# ---- End of the main program ----#