nrubin · October 2, 2014 23:13
diff --git a/HipsterImages.py b/HipsterImages.py
 #let's download some hipster images
 import requests
 from bs4 import BeautifulSoup
 import numpy as np
 import time
 import os

 def throttle(mean_ms=200.0, sd_ms=80.0):
 	"""Sleep for a random, normally distributed delay.

 	The delay is drawn from a normal distribution with the given mean and
 	standard deviation and is redrawn until it is non-negative, because a
 	negative duration cannot be slept.

 	Args:
 		mean_ms: mean delay in milliseconds; must not be negative.
 		sd_ms: standard deviation of the delay in milliseconds.

 	Raises:
 		ValueError: if mean_ms is negative, since the redraw loop could
 			then spin for a very long time (forever when sd_ms is 0).
 	"""
 	if mean_ms < 0:
 		raise ValueError("mean_ms must be non-negative, got %r" % (mean_ms,))
 	t = sd_ms*np.random.randn() + mean_ms
 	while t < 0:
 		# Redraw from the same distribution as the first draw so that the
 		# retry does not skew the delays towards a different mean.
 		t = sd_ms*np.random.randn() + mean_ms
 	# A parenthesised single-argument print works on Python 2 and 3 alike.
 	print("throttling for %f ms" % (t))
 	time.sleep(t/1000.0)

 def get(url, timeout=30):
 	"""Fetch url with HTTP GET and return the response body as text.

 	Args:
 		url: address to request.
 		timeout: seconds to wait for the server before giving up.

 	Returns:
 		The decoded body when the server answers with status 200,
 		otherwise None. None is also returned when the request itself
 		fails (timeout, connection error), so callers only have one
 		failure signal to check.
 	"""
 	try:
 		# Without a timeout a stalled server would block the crawl forever.
 		response = requests.get(url, timeout=timeout)
 	except requests.exceptions.RequestException:
 		return None
 	if response.status_code != 200:
 		return None
 	return response.text

 def anchors_with_links(tag):
 	"""Return True when tag is an <a> element whose href contains "photos".

 	Used as the filter function handed to BeautifulSoup's find_all().
 	Anchors that carry no href attribute are rejected instead of raising
 	KeyError.
 	"""
 	if tag.name != "a":
 		return False
 	# .get() tolerates anchors such as <a name="top"> that have no href.
 	href = tag.get('href') or ""
 	return "photos" in href

 def index_photo(photo_links,photo_uri):
 	"""Record the (photo_id, full_uri) pair for one photo link.

 	Args:
 		photo_links: set that collects the pairs; modified in place.
 		photo_uri: href of the photo link, presumably of the form
 			"/photos/<id>/download".

 	The id is the path segment that follows "/photos/". Links without
 	such a segment are ignored, because they would otherwise yield a
 	meaningless id.
 	"""
 	marker = "/photos/"
 	start = photo_uri.find(marker)
 	if start < 0:
 		return
 	# Keep only the first segment after the marker, so the id is right
 	# whether or not the link carries a trailing "/download".
 	photo_id = photo_uri[start + len(marker):].split("/")[0]
 	# A query string or fragment is not part of the id.
 	photo_id = photo_id.split("?")[0].split("#")[0]
 	if not photo_id:
 		return
 	if photo_uri.startswith(("http://", "https://")):
 		# Already absolute: prefixing the host again would corrupt it.
 		full_uri = photo_uri
 	else:
 		full_uri = "https://unsplash.com" + photo_uri
 	photo_links.add((photo_id,full_uri))

 def parse(photo_links,text):
 	"""Collect every photo link found in an HTML page.

 	Args:
 		photo_links: set that receives one entry per photo link
 			(filled through index_photo).
 		text: HTML source of the page.
 	"""
 	# Name the parser explicitly: otherwise bs4 uses whichever parser is
 	# installed, and the parsed tree can differ between machines.
 	soup = BeautifulSoup(text, "html.parser")
 	for anchor in soup.find_all(anchors_with_links):
 		index_photo(photo_links,anchor['href'])

 def file_exists(filename):
 	"""Tell whether filename refers to an existing regular file."""
 	is_regular_file = os.path.isfile(filename)
 	return is_regular_file

 def download_photo(photo_tuple,path):
 	"""Download one photo to disk unless it is already there.

 	Args:
 		photo_tuple: (photo_id, photo_url) pair; the file is saved as
 			"<photo_id>.jpg".
 		path: directory to save into; an empty value selects the
 			default "./Images/".
 	"""
 	filename = photo_tuple[0] + ".jpg"
 	photo_url = photo_tuple[1]
 	directory = path or "./Images/"
 	target = os.path.join(directory, filename)
 	if os.path.isfile(target):
 		print("%s already downloaded, skipping..." % (filename))
 		return
 	if not os.path.isdir(directory):
 		os.makedirs(directory)
 	r = requests.get(photo_url, stream=True, timeout=30)
 	try:
 		if r.status_code != 200:
 			print("could not download %s (HTTP %d)" % (filename, r.status_code))
 			return
 		# Write under a temporary name first: an interrupted transfer must
 		# not leave a truncated .jpg that later runs would take as complete.
 		partial = target + ".part"
 		with open(partial, 'wb') as f:
 			for chunk in r.iter_content(1024):
 				f.write(chunk)
 		os.rename(partial, target)
 	finally:
 		# A streamed response keeps its connection until it is closed.
 		r.close()

 def mark_page_done(num):
 	"""Leave an empty marker file recording that page num is finished.

 	The marker is "./Images/.page-<num>".
 	"""
 	path = "./Images/"
 	filename = ".page-%d" % (num)
 	# The marker lives next to the images; create the directory so that
 	# marking a page cannot fail just because nothing was saved yet.
 	if not os.path.isdir(path):
 		os.makedirs(path)
 	# Append mode creates the file when missing and never truncates it.
 	open(path + filename,'a').close()

 def page_done(num):
 	"""Return True when page num has already been marked as finished.

 	A page counts as finished when its marker file
 	"./Images/.page-<num>" exists.
 	"""
 	path = "./Images/"
 	filename = ".page-%d" % (num)
 	# The result must be returned: without it callers always see None and
 	# treat every page as unfinished.
 	return os.path.isfile(path+filename)

 def get_all_photos():
 	"""Crawl the photo listing page by page and download every photo.

 	Pages already marked as done are skipped, so an interrupted run can
 	be resumed. The crawl ends at the first page that cannot be fetched
 	or that contains no photo links; the page number is printed then.
 	"""
 	for ind in range(1,1000):
 		if page_done(ind):
 			# Skip before fetching, and keep going: later pages may still
 			# be pending when a previous run was interrupted.
 			continue
 		url = "http://unsplash.com/grid?_=1412274378042&page=%d" % (ind)
 		print("getting url %d" % (ind))
 		# Pause before each listing request so the site is not hammered.
 		throttle()
 		text = get(url)
 		if text is None:
 			print(ind)
 			return
 		photo_links = set()
 		parse(photo_links,text)
 		if not photo_links:
 			# Presumably the listing is exhausted once a page has no photo
 			# links; stop instead of requesting the remaining pages.
 			print(ind)
 			return
 		total = len(photo_links)
 		for progress, photo in enumerate(photo_links, 1):
 			download_photo(photo,"")
 			print("downloaded photo %d of %d" % (progress,total))
 		mark_page_done(ind)


 # Start the crawl only when run as a script, not when imported.
 if "__main__" == __name__:
 	get_all_photos()
	#let's download some hipster images
	import requests
	from bs4 import BeautifulSoup
	import numpy as np
	import time
	import os

	def throttle(mean_ms=200.0, sd_ms=80.0):
		"""Sleep for a random, normally distributed delay.

		The delay is drawn from a normal distribution with the given mean and
		standard deviation and is redrawn until it is non-negative, because a
		negative duration cannot be slept.

		Args:
			mean_ms: mean delay in milliseconds; must not be negative.
			sd_ms: standard deviation of the delay in milliseconds.

		Raises:
			ValueError: if mean_ms is negative, since the redraw loop could
				then spin for a very long time (forever when sd_ms is 0).
		"""
		if mean_ms < 0:
			raise ValueError("mean_ms must be non-negative, got %r" % (mean_ms,))
		t = sd_ms*np.random.randn() + mean_ms
		while t < 0:
			# Redraw from the same distribution as the first draw so that the
			# retry does not skew the delays towards a different mean.
			t = sd_ms*np.random.randn() + mean_ms
		# A parenthesised single-argument print works on Python 2 and 3 alike.
		print("throttling for %f ms" % (t))
		time.sleep(t/1000.0)

	def get(url, timeout=30):
		"""Fetch url with HTTP GET and return the response body as text.

		Args:
			url: address to request.
			timeout: seconds to wait for the server before giving up.

		Returns:
			The decoded body when the server answers with status 200,
			otherwise None. None is also returned when the request itself
			fails (timeout, connection error), so callers only have one
			failure signal to check.
		"""
		try:
			# Without a timeout a stalled server would block the crawl forever.
			response = requests.get(url, timeout=timeout)
		except requests.exceptions.RequestException:
			return None
		if response.status_code != 200:
			return None
		return response.text

	def anchors_with_links(tag):
		"""Return True when tag is an <a> element whose href contains "photos".

		Used as the filter function handed to BeautifulSoup's find_all().
		Anchors that carry no href attribute are rejected instead of raising
		KeyError.
		"""
		if tag.name != "a":
			return False
		# .get() tolerates anchors such as <a name="top"> that have no href.
		href = tag.get('href') or ""
		return "photos" in href

	def index_photo(photo_links,photo_uri):
		"""Record the (photo_id, full_uri) pair for one photo link.

		Args:
			photo_links: set that collects the pairs; modified in place.
			photo_uri: href of the photo link, presumably of the form
				"/photos/<id>/download".

		The id is the path segment that follows "/photos/". Links without
		such a segment are ignored, because they would otherwise yield a
		meaningless id.
		"""
		marker = "/photos/"
		start = photo_uri.find(marker)
		if start < 0:
			return
		# Keep only the first segment after the marker, so the id is right
		# whether or not the link carries a trailing "/download".
		photo_id = photo_uri[start + len(marker):].split("/")[0]
		# A query string or fragment is not part of the id.
		photo_id = photo_id.split("?")[0].split("#")[0]
		if not photo_id:
			return
		if photo_uri.startswith(("http://", "https://")):
			# Already absolute: prefixing the host again would corrupt it.
			full_uri = photo_uri
		else:
			full_uri = "https://unsplash.com" + photo_uri
		photo_links.add((photo_id,full_uri))

	def parse(photo_links,text):
		"""Collect every photo link found in an HTML page.

		Args:
			photo_links: set that receives one entry per photo link
				(filled through index_photo).
			text: HTML source of the page.
		"""
		# Name the parser explicitly: otherwise bs4 uses whichever parser is
		# installed, and the parsed tree can differ between machines.
		soup = BeautifulSoup(text, "html.parser")
		for anchor in soup.find_all(anchors_with_links):
			index_photo(photo_links,anchor['href'])

	def file_exists(filename):
		"""Tell whether filename refers to an existing regular file."""
		return os.path.isfile(filename)

	def download_photo(photo_tuple,path):
		"""Download one photo to disk unless it is already there.

		Args:
			photo_tuple: (photo_id, photo_url) pair; the file is saved as
				"<photo_id>.jpg".
			path: directory to save into; an empty value selects the
				default "./Images/".
		"""
		filename = photo_tuple[0] + ".jpg"
		photo_url = photo_tuple[1]
		directory = path or "./Images/"
		target = os.path.join(directory, filename)
		if os.path.isfile(target):
			print("%s already downloaded, skipping..." % (filename))
			return
		if not os.path.isdir(directory):
			os.makedirs(directory)
		r = requests.get(photo_url, stream=True, timeout=30)
		try:
			if r.status_code != 200:
				print("could not download %s (HTTP %d)" % (filename, r.status_code))
				return
			# Write under a temporary name first: an interrupted transfer must
			# not leave a truncated .jpg that later runs would take as complete.
			partial = target + ".part"
			with open(partial, 'wb') as f:
				for chunk in r.iter_content(1024):
					f.write(chunk)
			os.rename(partial, target)
		finally:
			# A streamed response keeps its connection until it is closed.
			r.close()

	def mark_page_done(num):
		"""Leave an empty marker file recording that page num is finished.

		The marker is "./Images/.page-<num>".
		"""
		path = "./Images/"
		filename = ".page-%d" % (num)
		# The marker lives next to the images; create the directory so that
		# marking a page cannot fail just because nothing was saved yet.
		if not os.path.isdir(path):
			os.makedirs(path)
		# Append mode creates the file when missing and never truncates it.
		open(path + filename,'a').close()

	def page_done(num):
		"""Return True when page num has already been marked as finished.

		A page counts as finished when its marker file
		"./Images/.page-<num>" exists.
		"""
		path = "./Images/"
		filename = ".page-%d" % (num)
		# The result must be returned: without it callers always see None and
		# treat every page as unfinished.
		return os.path.isfile(path+filename)

	def get_all_photos():
		"""Crawl the photo listing page by page and download every photo.

		Pages already marked as done are skipped, so an interrupted run can
		be resumed. The crawl ends at the first page that cannot be fetched
		or that contains no photo links; the page number is printed then.
		"""
		for ind in range(1,1000):
			if page_done(ind):
				# Skip before fetching, and keep going: later pages may still
				# be pending when a previous run was interrupted.
				continue
			url = "http://unsplash.com/grid?_=1412274378042&page=%d" % (ind)
			print("getting url %d" % (ind))
			# Pause before each listing request so the site is not hammered.
			throttle()
			text = get(url)
			if text is None:
				print(ind)
				return
			photo_links = set()
			parse(photo_links,text)
			if not photo_links:
				# Presumably the listing is exhausted once a page has no photo
				# links; stop instead of requesting the remaining pages.
				print(ind)
				return
			total = len(photo_links)
			for progress, photo in enumerate(photo_links, 1):
				download_photo(photo,"")
				print("downloaded photo %d of %d" % (progress,total))
			mark_page_done(ind)


	# Start the crawl only when run as a script, not when imported.
	if __name__ == '__main__':
		get_all_photos()