Created May 29, 2014 10:07
Bing scraper, but with more added to the query address
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import urllib2
import os
import re, urlparse
import time
import pdb
from interruptingcow import timeout
import cPickle
from multiprocessing import Pool
from subprocess import Popen, PIPE
from collections import Counter
'''
TODO:
 -> Some of the downloaded images are malformed files. Fix this.
 -> Some download links are internal Bing links, so Bing pages are also downloaded. Fix this.
 -> get_link() defines root_url in itself. Pass it as an argument.
 -> If the link list was written before, do not scrape again.
'''
def chunk(l, n):
    # split list l into consecutive chunks of size n
    if n < 1:
        n = 1
    return [l[i:i+n] for i in range(0, len(l), n)]
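# Illustrative example (not in the original): chunk([1, 2, 3, 4, 5], 2)
# returns [[1, 2], [3, 4], [5]] -- the last chunk keeps the remainder.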
def run(cmd):
    # run a shell command and report its output and exit status
    print '-'*40
    print 'running:', cmd
    p = Popen(cmd, stderr=PIPE, stdout=PIPE, shell=True)
    output, errors = p.communicate()
    print [p.returncode, errors, output]
    if p.returncode or errors:
        print 'something went wrong...'
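# Example usage (illustrative): run('ls -l') prints the command, then
# [returncode, stderr, stdout], and warns on a non-zero exit or any stderr.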
def get_soup(url):
    # requests.get sometimes raises a socket error; return None on failure
    try:
        r = requests.get(url)
    except:
        return None
    if r.status_code == 200:
        return BeautifulSoup(r.text)
    else:
        return None
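# Example (illustrative): get_soup('http://m.bing.com') returns a parsed
# BeautifulSoup tree on a 200 response, and None on any error or other status.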
# handle non-ascii characters in URLs
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )
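# Illustrative example: iriToUri(u'http://www.a\u0131b.com/a\u0131b')
# IDNA-encodes the host (index 1 of the parse result) and percent-encodes
# the non-ASCII bytes elsewhere, yielding an ASCII-only URL that
# urllib2/wget can fetch.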
def full_run(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    # if the link file already exists, do not crawl those links again
    # if os.path.exists(query+'.txt'):
    # Collect real img links from the img show-up pages of Bing
    #p = Pool(100)
    #query_list = zip([query]*num_imgs, range(num_imgs))
    #img_links = p.map(get_link, query_list)
    if os.path.exists(img_link_file_path):
        f = open(img_link_file_path, 'r')
        img_links = f.readlines()
        f.close()
    else:
        img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        print img_counter, ' of query ', query
        with timeout(20, exception=RuntimeError):
            try:
                link = get_link([query, raw_counter])
            except RuntimeError:
                print 'Timeout!!!!'
                img_counter += 1
                continue
        # wait!! maybe Bing banned you
        if link == None:
            time.sleep(5)
            img_counter += 1
            continue
        try:
            con = urllib2.urlopen(link)
        except:
            bad_img_links.append(link)
            img_counter += 1
            continue
        # if the link does not work, do not count it
        if con.getcode() != 200:
            bad_img_links.append(link)
            img_counter += 1
            continue
        print "link ", link
        # time.sleep(0.5)
        if old_links[link] == 0:
            img_links.append(link)
            img_counter += 1
            old_links[link] += 1
        else:
            print "Duplicate Link!!"
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Create the root class folder if it does not exist
    try:
        fold_path = output_path + raw_query
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
    except:
        pass
    # pdb.set_trace()
    for count, img in enumerate(img_links):
        if img != None:
            #print img
            img = img.strip()
            out_path = fold_path + "/" + str(count) + ".jpg"
            # command = 'wget -O '+out_path+' -o download.log -A.jpeg,.jpg -b ' + iriToUri(img)
            img = iriToUri(img)
            print img, ' to be downloaded'
            command = 'wget -O ' + out_path + ' -t 1 -o download.log --timeout=600 ' + img
            # os.system(command)
            run(command)
            time.sleep(2)
        else:
            #print img
            print 'IS NONE!!!'
    #raw_img = urllib2.urlopen(img).read()
    # cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    # f = open("images/" + image_type + "_"+ str(count), 'wb')
    # f.write(raw_img)
    # f.close()
    return True
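# Example call (the paths here are hypothetical, not from the original):
# full_run('tiger woods', '/data/bing_images/', '/data/bing_image_urls/', num_imgs=100)
# scrapes up to 100 candidate links, writes the link files, and downloads
# each image via wget into /data/bing_images/tiger_woods/.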
def gather_img_files(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        try:
            with timeout(5, exception=RuntimeError):
                print img_counter, ' of query ', query
                link = get_link([query, raw_counter])
                # wait!! maybe Bing banned you
                if link == None:
                    time.sleep(5)
                    img_counter += 1
                    continue
                # try:
                #     print 'Checking Link!!!'
                #     con = urllib2.urlopen(link)
                # except:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                # # if the link does not work, do not count it
                # if con.getcode() != 200:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                print "link ", link
                # time.sleep(0.5)
                if old_links[link] == 0:
                    img_links.append(link)
                    img_counter += 1
                    old_links[link] += 1
                else:
                    print "Duplicate Link!!"
                    img_counter += 1
                    continue
        except RuntimeError:
            print 'Timeout!!!!'
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link != None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
# slave code for multi-processing
def get_link(args):
    # url = "http://m.bing.com/images/more?q="+args[0]+"&ii="+str(args[1])+"&dv=True&form=IGSIGS&IIG=c2a0b6a0c2ab4b179a7c565fa914d169&kval=3.1&AppNs=mSERP"
    url = "http://m.bing.com/images?q=" + args[0] + "&ii=" + str(args[1]) + "&dv=True"
    print url
    soup = get_soup(url)
    if soup != None:
        # if no link is retrieved, break the iteration
        matches = [a['href'] for a in soup.find_all("a", {"href": re.compile("http:")})]
        if len(matches) > 0:
            img_link = matches[0]
            img_link = img_link.split('?')[0]
            return img_link
        else:
            return None  # link has some error
    else:
        return None  # page gives some error
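# Illustrative example: get_link(['tiger+woods', 0]) builds the mobile Bing
# image URL for result index 0 and returns the first external http link on
# that page with its query string stripped, or None on any failure.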
if __name__ == "__main__":
    chunk_no = 1
    output_path = "/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_images/"
    output_img_paths = '/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_image_urls/'
    # folders = os.listdir('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/pubfig83/images')
    # names = [folder.replace('_', ' ') for folder in folders]
    f = open('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/all_names.txt', 'r')
    names = f.readlines()
    f.close()
    # name_chunks = chunk(names, 5)
    # names = name_chunks[chunk_no-1]
    # f = open('/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/Dataset_matlab/all_names.txt','r')
    # names = f.readlines()
    LAST_NAME = 'nikolay davydenko'
    cont_flag = False
    # resume link gathering after LAST_NAME in the name list
    for name in names:
        name = name.strip()
        # folder_name = name.replace(" ","_")
        if cont_flag:
            gather_img_files(name, output_path, output_img_paths, 800)
        if name == LAST_NAME:
            cont_flag = True
    for name in names:
        name = name.strip()
        full_run(name, output_path, output_img_paths, 800)
    # full_run('tiger')