A simple method to scrape Google Images results from the Google search engine. - https://informationcorners.com/google-search-image-scraper/
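The script below builds one Google Images results URL per keyword/suffix pair, downloads the raw HTML with urllib, slices the image URLs out of the embedded rg_meta JSON blobs, and saves every image under media/<keyword>/. The only dependency outside the standard library is the third-party rfc6266 package (installable with pip), used to derive filenames from Content-Disposition headers.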
# coding: utf-8
import time             # used to time the crawl and to throttle requests
import os
import urllib.request
import rfc6266          # third-party (pip install rfc6266); parses Content-Disposition filenames
import random
import string
# ########## Edit From Here ###########
# The list of search keywords. Edit this list to search for Google images
# of your choice; you can simply add and remove elements.
search_keyword = ['famous']

# This list adds suffixes to each search term; every element yields one
# results page (roughly 100 images). Add an empty string '' as the first
# element if you also want the bare keyword searched on its own.
# For example, if an element of search_keyword is 'Australia' and an element
# of keywords is 'high resolution', the script searches for
# 'Australia high resolution'.
keywords = ['quotes', 'sayings', 'best quotes', 'memes']

# local storage folder
storage_path = 'media'
storage_path = os.path.join(os.getcwd(), storage_path)
# ########## End of Editing ###########
error_links = []
error_count = 0


# Generate a short random string, used to make saved filenames unique.
def random_string(n=8):
    chars = string.ascii_uppercase + string.ascii_lowercase + string.digits
    return ''.join(random.choice(chars) for _ in range(n))
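# A quick sanity check of the helper above (hypothetical output, since the
# result is random): random_string() might return something like 'aZ3kP9qL',
# and random_string(4) something like 'Qx7b'.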

# Download an entire web document (raw page content) as a string.
def download_page(url):
    try:
        header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, "
                                "like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req)
        # str() on the raw bytes keeps the escaped representation (b'...'),
        # which the substring parsing below relies on
        response_data = str(resp.read())
        return response_data
    except Exception as e:
        error_links.append(url)
        print(str(e))
        return ''  # return an empty page so callers can still scan it safely
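# A desktop browser User-Agent is sent because Google serves a different,
# harder-to-parse page to clients it identifies as scripts. The Chrome 41
# string above is simply the one this script shipped with; any recent
# desktop UA string should behave similarly.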

# Find the next image link in the given raw page.
def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:  # no more results on this page
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        # each result carries its metadata as JSON inside a class="rg_meta" div
        start_line = s.find('class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        # skip the 6 characters of '"ou":"' and drop the closing quote
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content
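# For reference, the metadata blob being sliced looks roughly like this
# (an illustrative, abbreviated example, not captured output):
#   {"id":"...","ou":"https://example.com/image.jpg","ow":1024,"oh":768,...}
# "ou" is the original image URL and "ow"/"oh" its dimensions, so slicing
# between '"ou":"' and ',"ow"' yields the bare URL.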

# Get all links on a page with the help of '_images_get_next_item'.
def _images_get_all_items(page):
    _items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            # some links come percent-escaped inside a redirect wrapper; the
            # 'u003d' (escaped '=') marker is used to cut out the real URL
            error_url = item
            fault_value = 'u003d'
            if fault_value in error_url:
                urls = error_url.split(fault_value, 2)
                url = urls[1]
                url = url.split('\\', 2)[0]
                item = url
            _items.append(item)  # collect every extracted link
            time.sleep(0.1)  # small delay to slow down the request rate
            page = page[end_content:]  # continue scanning past the last match
    return _items
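# The scan works like a cursor: after each hit the page string is sliced at
# 'end_content', so the next find() starts past the previous result. A rough
# trace with made-up offsets: find '"ou"' at 500 -> keep the URL, slice to
# page[560:], find the next '"ou"' near the start again, and so on until
# 'rg_di' no longer appears.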

# ############ Main Program ############
t0 = time.time()  # start the timer

# Download the image links
index = 0
while index < len(search_keyword):
    items = []
    iteration = "Item no.: {} --> Item name = {}".format(index + 1, search_keyword[index])
    print(iteration)
    print("Evaluating...")
    search_keywords = search_keyword[index]
    search = search_keywords.replace(' ', '%20')
    path_to_store = os.path.join(storage_path, search_keywords.strip().replace(' ', '_'))

    # make a directory per search keyword
    if not os.path.exists(path_to_store):
        os.makedirs(path_to_store)

    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + '%20' + pure_keyword + \
              '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
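        # Of the query parameters above, 'q' is the URL-encoded search text and
        # 'tbm=isch' selects the image-search tab; the rest ('ei', 'ved', 'biw',
        # 'bih', ...) appear to be session/viewport tokens copied from a live
        # browser request and are likely not all required.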
        raw_html = download_page(url)
        time.sleep(0.1)
        items += _images_get_all_items(raw_html)
        j += 1

    # print("Image Links = " + str(items))
    print("Total Image Links = " + str(len(items)))
    print("\n")

    # Append all the links to a text file ('output.txt') created in the same
    # directory as the script. Comment out the block below to skip this step.
    with open('output.txt', 'a') as info:
        info.write('\n')
        info.write("Google search results for " + search_keyword[index] + '\n')
        info.write("Total Image Links = " + str(len(items)))
        info.write('\n')
        info.write(str(index) + ': ' + str(search_keyword[index]) + ": " + str(items) + "\n\n\n")
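    # After one pass, output.txt holds a block roughly like this (illustrative
    # values, not captured output):
    #
    #   Google search results for famous
    #   Total Image Links = 400
    #   0: famous: ['https://example.com/a.jpg', 'https://example.com/b.png', ...]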
    t1 = time.time()  # stop the timer
    total_time = t1 - t0  # total time taken to crawl the links
    print("Total time taken: " + str(total_time) + " Seconds")
    print("Starting Download...")

    # Save the images to the keyword directory, skipping any URL that errors.
    k = 0
    while k < len(items):
        try:
            header = dict()
            header['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) " \
                                   "Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(items[k], headers=header)
            print(items[k])
            response = urllib.request.urlopen(req, None, 15)
            # derive a filename from the Content-Disposition header, then
            # prefix it with a random string to avoid collisions
            file_name = rfc6266.parse_requests_response(response).filename_unsafe
            file_to_write = os.path.join(path_to_store, random_string() + '_' + file_name)
            with open(file_to_write, 'wb') as output_file:
                data = response.read()
                output_file.write(data)
            response.close()
            print("completed ====> " + str(k + 1))
            k += 1
        except Exception as e:
            error_links.append(items[k])
            error_count += 1
            print("Error at link no. " + str(k) + ": " + str(e))
            k += 1
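    # urlopen(req, None, 15) sets a 15-second timeout, so a slow or dead host
    # raises and gets logged rather than stalling the whole crawl; the loop
    # then simply moves on to the next link.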
    t2 = time.time()  # stop the timer
    total_time = t2 - t1  # total time taken to download the images
    print("Total time taken: " + str(total_time) + " Seconds")
    index += 1
    print("\n")

print("Everything downloaded!")
print("\n" + str(error_count) + " ----> Total Errors")
print(error_links)
# ---- End of the main program ----#