Created May 29, 2014 10:07
Bing scraper, but with more added to the query address
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import urllib2
import os
import re, urlparse
import time
import pdb
from interruptingcow import timeout
import cPickle
from multiprocessing import Pool
from subprocess import Popen, PIPE
from collections import Counter
'''
TODO:
 -> Some of the downloaded images are malformed files. Fix this.
 -> Some download links are internal Bing links, so Bing pages are also downloaded. Fix this.
 -> get_link() defines root_url in itself. Pass it as an argument.
 -> If the link list was written before, do not scrape again.
'''
def chunk(l, n):
    # split list l into consecutive chunks of size n
    if n < 1:
        n = 1
    return [l[i:i+n] for i in range(0, len(l), n)]
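# Illustrative example (not in the original): chunk([1, 2, 3, 4, 5], 2)
# returns [[1, 2], [3, 4], [5]] -- the last chunk keeps the remainder.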
def run(cmd):
    # run a shell command and report its output and exit status
    print '-'*40
    print 'running:', cmd
    p = Popen(cmd, stderr=PIPE, stdout=PIPE, shell=True)
    output, errors = p.communicate()
    print [p.returncode, errors, output]
    if p.returncode or errors:
        print 'something went wrong...'
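# Example usage (illustrative): run('ls -l') prints the command, then
# [returncode, stderr, stdout], and warns on a non-zero exit or any stderr.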
def get_soup(url):
    # requests.get sometimes raises a socket error; return None on failure
    try:
        r = requests.get(url)
    except:
        return None
    if r.status_code == 200:
        return BeautifulSoup(r.text)
    else:
        return None
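# Example (illustrative): get_soup('http://m.bing.com') returns a parsed
# BeautifulSoup tree on a 200 response, and None on any error or other status.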
# handle non-ascii characters in URLs
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )
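# Illustrative example: iriToUri(u'http://www.a\u0131b.com/a\u0131b')
# IDNA-encodes the host (index 1 of the parse result) and percent-encodes
# the non-ASCII bytes elsewhere, yielding an ASCII-only URL that
# urllib2/wget can fetch.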
def full_run(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    # if the link file already exists, do not crawl those links again
    # if os.path.exists(query+'.txt'):
    # Collect real img links from the img show-up pages of Bing
    #p = Pool(100)
    #query_list = zip([query]*num_imgs, range(num_imgs))
    #img_links = p.map(get_link, query_list)
    if os.path.exists(img_link_file_path):
        f = open(img_link_file_path, 'r')
        img_links = f.readlines()
        f.close()
    else:
        img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        print img_counter, ' of query ', query
        with timeout(20, exception=RuntimeError):
            try:
                link = get_link([query, raw_counter])
            except RuntimeError:
                print 'Timeout!!!!'
                img_counter += 1
                continue
        # wait!! maybe Bing banned you
        if link == None:
            time.sleep(5)
            img_counter += 1
            continue
        try:
            con = urllib2.urlopen(link)
        except:
            bad_img_links.append(link)
            img_counter += 1
            continue
        # if the link does not work, do not count it
        if con.getcode() != 200:
            bad_img_links.append(link)
            img_counter += 1
            continue
        print "link ", link
        # time.sleep(0.5)
        if old_links[link] == 0:
            img_links.append(link)
            img_counter += 1
            old_links[link] += 1
        else:
            print "Duplicate Link!!"
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Create the root class folder if it does not exist
    try:
        fold_path = output_path + raw_query
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
    except:
        pass
    # pdb.set_trace()
    for count, img in enumerate(img_links):
        if img != None:
            #print img
            img = img.strip()
            out_path = fold_path + "/" + str(count) + ".jpg"
            # command = 'wget -O '+out_path+' -o download.log -A.jpeg,.jpg -b ' + iriToUri(img)
            img = iriToUri(img)
            print img, ' to be downloaded'
            command = 'wget -O ' + out_path + ' -t 1 -o download.log --timeout=600 ' + img
            # os.system(command)
            run(command)
            time.sleep(2)
        else:
            #print img
            print 'IS NONE!!!'
    #raw_img = urllib2.urlopen(img).read()
    # cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    # f = open("images/" + image_type + "_"+ str(count), 'wb')
    # f.write(raw_img)
    # f.close()
    return True
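# Example call (the paths here are hypothetical, not from the original):
# full_run('tiger woods', '/data/bing_images/', '/data/bing_image_urls/', num_imgs=100)
# scrapes up to 100 candidate links, writes the link files, and downloads
# each image via wget into /data/bing_images/tiger_woods/.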
def gather_img_files(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        try:
            with timeout(5, exception=RuntimeError):
                print img_counter, ' of query ', query
                link = get_link([query, raw_counter])
                # wait!! maybe Bing banned you
                if link == None:
                    time.sleep(5)
                    img_counter += 1
                    continue
                # try:
                #     print 'Checking Link!!!'
                #     con = urllib2.urlopen(link)
                # except:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                # # if the link does not work, do not count it
                # if con.getcode() != 200:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                print "link ", link
                # time.sleep(0.5)
                if old_links[link] == 0:
                    img_links.append(link)
                    img_counter += 1
                    old_links[link] += 1
                else:
                    print "Duplicate Link!!"
                    img_counter += 1
                    continue
        except RuntimeError:
            print 'Timeout!!!!'
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
# slave code for multi-processing
def get_link(args):
    # url = "http://m.bing.com/images/more?q="+args[0]+"&ii="+str(args[1])+"&dv=True&form=IGSIGS&IIG=c2a0b6a0c2ab4b179a7c565fa914d169&kval=3.1&AppNs=mSERP"
    url = "http://m.bing.com/images?q=" + args[0] + "&ii=" + str(args[1]) + "&dv=True"
    print url
    soup = get_soup(url)
    if soup != None:
        # if no link is retrieved, break the iteration
        matches = [a['href'] for a in soup.find_all("a", {"href": re.compile("http:")})]
        if len(matches) > 0:
            img_link = matches[0]
            img_link = img_link.split('?')[0]
            return img_link
        else:
            return None  # link has some error
    else:
        return None  # page gives some error
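# Illustrative example: get_link(['tiger+woods', 0]) builds the mobile Bing
# image URL for result index 0 and returns the first external http link on
# that page with its query string stripped, or None on any failure.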
if __name__ == "__main__":
    chunk_no = 1
    output_path = "/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_images/"
    output_img_paths = '/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_image_urls/'
    # folders = os.listdir('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/pubfig83/images')
    # names = [folder.replace('_', ' ') for folder in folders]
    f = open('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/all_names.txt', 'r')
    names = f.readlines()
    f.close()
    # name_chunks = chunk(names, 5)
    # names = name_chunks[chunk_no-1]
    # f = open('/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/Dataset_matlab/all_names.txt','r')
    # names = f.readlines()
    LAST_NAME = 'nikolay davydenko'
    cont_flag = False
    # resume link gathering after LAST_NAME in the name list
    for name in names:
        name = name.strip()
        # folder_name = name.replace(" ","_")
        if cont_flag:
            gather_img_files(name, output_path, output_img_paths, 800)
        if name == LAST_NAME:
            cont_flag = True
    for name in names:
        name = name.strip()
        full_run(name, output_path, output_img_paths, 800)
    # full_run('tiger')