@bashkirtsevich · Last active February 15, 2019 08:30
Google image search
# Python 3.7
import asyncio
import json
import os
import re

import aiofiles
from aiohttp import ClientSession, ClientTimeout


async def http_query(url, load_cb):
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
    async with ClientSession(headers={"User-Agent": user_agent}, timeout=ClientTimeout(total=60)) as session:
        try:
            async with session.get(url) as response:
                return await load_cb(response)
        except Exception as e:
            print("GET error: '{}'".format(e))


async def http_read_bytes(url):
    return await http_query(url, lambda response: response.read())


async def http_read_text(url):
    return await http_query(url, lambda response: response.text())


async def http_save(url, path):
    try:
        data = await http_read_bytes(url)
        if data:
            async with aiofiles.open(path, "wb") as f:
                await f.write(data)
    except Exception:
        # Ignore failures on individual images and keep downloading the rest.
        pass


async def save_images(query, name_prefix, save_path=""):
    search_url = f"https://www.google.com/search?q={query}&source=lnms&tbm=isch"
    page = await http_read_text(search_url)
    if not page:
        return
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    # Each result embeds the original image URL as a JSON "ou" field;
    # json.loads undoes the \uXXXX escaping inside the matched URL.
    await asyncio.gather(*[
        http_save(
            json.loads(f'"{url}"'),
            os.path.join(save_path, f"{name_prefix}_{idx + 1}.jpg")
        )
        for idx, url in enumerate(re.findall(r'"ou":"(.*?)"', page))
    ])


async def main():
    # Run all queries concurrently; save_images awaits its own downloads.
    await asyncio.gather(*[
        save_images(query, query, "images")
        for query in ["arachnid", "spider", "scorpion", "scorpion+arachnid"]
    ])


if __name__ == '__main__':
    asyncio.run(main())
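
As a side note on the json.loads trick in save_images: Google escapes characters inside the embedded image URLs (for example "=" appears as \u003d), and wrapping the matched text in quotes lets the JSON decoder restore them. A minimal standalone check, with a made-up sample URL:

import json

escaped = r"https://example.com/img?id\u003d42"  # made-up sample, as matched from the page source
print(json.loads(f'"{escaped}"'))  # -> https://example.com/img?id=42
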
# Python 2 (earlier version of the same scraper)
from bs4 import BeautifulSoup
import urllib2
import os
import json


def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), 'html.parser')


query = raw_input("query image: ")  # you can change the query for the image here
image_type = "ActiOn"
query = '+'.join(query.split())
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
print url

# add the directory for your images here
DIR = "Pictures"
header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

soup = get_soup(url, header)

ActualImages = []  # contains the links to the large original images and their file types
for a in soup.find_all("div", {"class": "rg_meta"}):
    meta = json.loads(a.text)
    ActualImages.append((meta["ou"], meta["ity"]))

print "there are total", len(ActualImages), "images"

if not os.path.exists(DIR):
    os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
    os.mkdir(DIR)

# download the images
for i, (img, Type) in enumerate(ActualImages):
    try:
        req = urllib2.Request(img, headers=header)
        raw_img = urllib2.urlopen(req).read()
        cntr = len([name for name in os.listdir(DIR) if image_type in name]) + 1
        print cntr
        f = open(os.path.join(DIR, image_type + "_" + str(cntr) + "." + (Type if Type else "jpg")), 'wb')
        f.write(raw_img)
        f.close()
    except Exception as e:
        print "could not load : " + img
        print e
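
For anyone on Python 3, here is a minimal sketch of the same synchronous scraper, assuming requests and beautifulsoup4 are installed; like the scripts above, it relies on the historical rg_meta/"ou" markup, which Google may no longer serve:

import json
import os

import requests
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"


def fetch_image_links(query):
    # Build the same image-search URL as the scripts above.
    url = "https://www.google.com/search?q=" + "+".join(query.split()) + "&source=lnms&tbm=isch"
    page = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60).text
    soup = BeautifulSoup(page, "html.parser")
    # Each rg_meta div holds a JSON blob: "ou" is the original image URL,
    # "ity" its file type.
    links = []
    for div in soup.find_all("div", {"class": "rg_meta"}):
        meta = json.loads(div.text)
        links.append((meta["ou"], meta.get("ity") or "jpg"))
    return links


def download_images(query, out_dir="Pictures"):
    os.makedirs(out_dir, exist_ok=True)
    for idx, (link, ext) in enumerate(fetch_image_links(query), start=1):
        try:
            data = requests.get(link, headers={"User-Agent": USER_AGENT}, timeout=60).content
            with open(os.path.join(out_dir, "{}_{}.{}".format(query, idx, ext)), "wb") as f:
                f.write(data)
        except Exception as e:
            print("could not load:", link, e)


if __name__ == "__main__":
    download_images("arachnid")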