Skip to content

Instantly share code, notes, and snippets.

@oltodosel
Created February 22, 2015 20:49
Show Gist options
  • Save oltodosel/716dd3771abf5b6112b3 to your computer and use it in GitHub Desktop.
Save oltodosel/716dd3771abf5b6112b3 to your computer and use it in GitHub Desktop.
Uses Google reverse image search to look for better-quality copies of the given images.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, os, re, io
import requests, queue, threading
from PIL import Image
# Uses Google reverse image search to look for better-quality copies of the given images.
# Scans the given directory of images, searches Google for similar ones and, if a similar enough picture with a greater width is found, replaces the original with it (or adds it alongside).
# Requires findimagedupes and ImageMagick to be installed.
# Uploader was copied from Rast1234
# https://github.com/Rast1234/imagesearch/blob/master/imagesearch.py
# Number of worker threads. Google may block your IP and redirect to CAPTCHA
# checks once this exceeds 5 or so.
# NOTE(review): the value below is above that stated limit - confirm intended.
thread_count = 6
# Timeout, in seconds, for fetching a candidate image from its URL.
dtimeout = 10
# How many candidate images to fetch from Google per source image. Google
# sorts them by size, so 5 is said to be enough.
# NOTE(review): the value below is 10, not 5 - confirm intended.
depth = 10
# Similarity threshold passed to findimagedupes, as a percentage.
similarity = 95
# Truthy: old images are replaced by the ones found.
# Falsy: the found image is saved as [filename]_new.[extension] in imdir_done.
move = True
# Directory holding the images to process; must end with a slash.
imdir = os.path.expanduser('~/Downloads/ff/pics/')
# Directory that processed images are moved to; must end with a slash.
imdir_done = os.path.expanduser('~/Downloads/ff/pics2/')
# Directory for temporary downloads; created with mkdir -p; must end with a slash.
# !!! The script tries to remove it at the end of execution !!!
tmpdir = '/tmp/dev/gimgs/'
def fs(line):
    """Run *line* through the shell and return everything it printed.

    Args:
        line: Shell command string. It is handed to the shell as-is, so the
            caller is responsible for quoting any embedded arguments.

    Returns:
        The command's stdout and stderr merged into one string, decoded as
        UTF-8, with leading and trailing whitespace stripped. The exit
        status is not inspected.

    Raises:
        UnicodeDecodeError: if the output is not valid UTF-8.
    """
    import subprocess

    # stdin is /dev/null so a command that reads its input sees EOF instead
    # of blocking forever; communicate() waits for the child and the context
    # manager closes the pipe, so no zombie or open descriptor is left behind.
    with subprocess.Popen(
        line,
        shell=True,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        output, _ = proc.communicate()
    return output.decode('utf-8').strip()
def tow(filename, data):
    """Write the text *data* to *filename*, replacing any existing content.

    Args:
        filename: Path of the file to create or truncate.
        data: String to write (platform default text encoding).
    """
    # The context manager closes the file even if write() raises.
    with open(filename, 'w') as out:
        out.write(data)
def towb(filename, data):
    """Write the bytes *data* to *filename*, replacing any existing content.

    Args:
        filename: Path of the file to create or truncate.
        data: Bytes-like object to write verbatim.
    """
    # The context manager closes the file even if write() raises.
    with open(filename, 'wb') as out:
        out.write(data)
def worker(dir_num):
    """Pull images off the shared queue and look for a wider copy of each.

    For every queued image the worker uploads it to Google's search-by-image
    endpoint, follows the "all sizes" link on the result page, downloads
    candidate images into its own temporary directory (stopping once
    ``depth`` candidates have been saved; at least one is always attempted)
    and asks ``findimagedupes`` which candidates are similar to the original.
    If the widest similar candidate is wider than the original it is moved
    into ``imdir_done`` (replacing the original when ``move`` is truthy,
    otherwise saved as ``<name>_new.<ext>``); if not, the original itself is
    moved to ``imdir_done``.

    The loop ends when the queue is empty or when Google answers with its
    CAPTCHA page.

    Relies on module-level state: ``q``, ``aicount``, ``grail``, ``headers``,
    ``postUrl`` and the configuration constants ``tmpdir``, ``imdir``,
    ``imdir_done``, ``depth``, ``dtimeout``, ``similarity`` and ``move``.

    Args:
        dir_num: Index of this worker; used as the name of its private
            sub-directory under ``tmpdir``.
    """
    import shlex  # quoting for the shell commands assembled below

    quote = shlex.quote
    browser_headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'}
    # Shell function findimagedupes calls for a pair of duplicates; it prints
    # "<width>---<path>" for each of the two files.
    view_script = 'VIEW(){ identify -format "%w" "$1"; echo ---$1; identify -format "%w" "$2"; echo ---$2; }'

    cur_tmpdir = tmpdir + str(dir_num) + '/'
    fs('mkdir -p ' + cur_tmpdir)
    while True:
        # progress: total images / still queued / live worker threads
        print(str(aicount) + '/' + str(q.qsize()) + '/' + str(threading.active_count() - 1))
        try:
            image = q.get(False)
        except queue.Empty:
            break
        done_path = image.replace(imdir, imdir_done)
        try:
            # Submit the file via multipart/form-data; 'encoded_image' is the
            # required field name, other fields are not needed.
            with open(image, 'rb') as image_file:
                fileDict = {'encoded_image': (image, image_file)}
                r = requests.post(postUrl, files=fileDict, cookies=grail.cookies, headers=headers)
            # The last redirect in the chain is the result page URL.
            result = r.history[-1].url

            # Result page; it carries the link to the "all sizes" listing
            # ('Все размеры' is the Russian label for that link).
            dd = requests.get(result, headers=browser_headers).text
            ll = re.findall('<span class="gl"><a href="(.*?)">Все размеры</a>', dd)
            if not ll:
                # Russian for "No other sizes of this image found".
                if 'Изображения других размеров не найдены' in dd:
                    fs('mv ' + quote(image) + ' ' + quote(done_path))
                    continue
                if 'This page appears when Google automatically detects requests coming from your computer network which appear to be in violation of the' in dd:
                    print('CAPTCHA!!! exiting...')
                    break
                # Unrecognised page: retry the image later.
                # NOTE(review): there is no retry limit, so an image that
                # always yields an unrecognised page is re-queued forever.
                print('pushing image back')
                q.put(image)
                continue
            ll = ll[0].replace('&amp;', '&')

            # Page listing the other sizes, with links to the images.
            dd = requests.get('https://www.google.ru' + ll, headers=browser_headers).text
            links = re.findall(r'\?imgurl=(.*?)&amp;', dd)

            # Download candidates; only ones that PIL accepts are saved.
            saved = 0
            for url in links:
                try:
                    payload = requests.get(url, timeout=dtimeout).content
                    Image.open(io.BytesIO(payload)).verify()
                    towb(cur_tmpdir + url.split('/')[-1], payload)
                except Exception:
                    # Best effort: an unreachable or invalid candidate is
                    # simply skipped.
                    continue
                saved += 1
                if saved >= depth:
                    break

            # Width of the original, as reported by ImageMagick.
            s_image = fs('identify -format "%w" ' + quote(image))
            best_size = 0
            best_image = None
            for img in fs('ls ' + cur_tmpdir).split('\n'):
                if not img:
                    continue
                # Candidate names come from remote URLs, so they are quoted
                # before being placed on a shell command line.
                report = fs('findimagedupes -q -t ' + str(similarity) + '% -i ' + quote(view_script) + ' -- ' + quote(image) + ' ' + quote(cur_tmpdir + img))
                # Drop the original's own "<width>---<path>" entry; what is
                # left (if anything) describes the matching candidate.
                dupe = report.replace(s_image + '---' + image, '').strip().split('---')
                if len(dupe) == 2 and int(dupe[0]) > best_size:
                    best_size = int(dupe[0])
                    best_image = dupe[1]

            if best_size > int(s_image):
                print(s_image + ' -> ' + str(best_size) + ' ' + image.replace(imdir, ''))
                if move:
                    # Remove the original only once the replacement is in place.
                    fs('mv ' + quote(best_image) + ' ' + quote(done_path) + ' && rm ' + quote(image))
                else:
                    new_path = done_path.rsplit('.', 1)[0] + '_new.' + image.rsplit('.', 1)[1]
                    fs('mv ' + quote(best_image) + ' ' + quote(new_path))
            else:
                fs('mv ' + quote(image) + ' ' + quote(done_path))
        except Exception as err:
            # Keep the worker alive; the image stays where it is.
            print(' Some Error!!! ' + image + ': ' + repr(err))
        finally:
            # Never let one image's candidates leak into the next comparison.
            fs('rm ' + cur_tmpdir + '*')
    fs('rmdir ' + cur_tmpdir)
############################
# Script body: fetch Google cookies, queue every image found in imdir, run
# the workers and remove the temporary directory once they are all done.
url = "https://www.google.ru/imghp"
postUrl = "https://www.google.ru/searchbyimage/upload"
headers = {
    'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1',
    'origin': 'https://www.google.ru',
    'referer': 'https://www.google.ru/imghp',
}
# crusade for cookies
grail = requests.get(url, headers=headers)
############################
os.makedirs(imdir_done, exist_ok=True)
q = queue.Queue()
for name in sorted(os.listdir(imdir)):
    # Hidden entries and sub-directories are not images to process.
    if name.startswith('.') or not os.path.isfile(imdir + name):
        continue
    q.put(imdir + name)
aicount = q.qsize()
threads = [threading.Thread(target=worker, args=(i,)) for i in range(thread_count)]
for t in threads:
    t.start()
# Wait for every worker: tmpdir can only be removed after the workers have
# removed their own sub-directories.
for t in threads:
    t.join()
fs('rmdir ' + tmpdir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment