'''
GET PATENT IMAGES
Jeff Thompson | 2017 | jeffreythompson.org

A little Python script that automatically downloads
images associated with a patent listing. Use Google's
Patent Search first, then the "Download (CSV)" button
to save a file with all the results.

'''

import csv, urllib, urllib2, re, os

csv_filename = 'search.csv'   # file to load from
image_folder = 'images'       # folder to save images to (will be created)


print 'extracting urls...'
with open('search.csv') as f:
	f.next()			# skip first line (with search details)
	f.next()			# skip second line (csv header)
	
	data = csv.reader(f, quotechar='"')
	listings = []
	for d in data:
		id =   d[0]
		date = d[7]
		url =  d[8]
		listings.append([id, date, url])
print '- found ' + str(len(listings))


print 'getting images...'
if not os.path.exists(image_folder):
	os.mkdir(image_folder)

for listing in listings:
	id =   listing[0]
	date = listing[1]
	url =  listing[2]
	print '- ' + url
	print '  - downloading page source...'
	
	# use a "user agent" otherwise we'll get blocked by Google :)
	user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
	headers = { 'User-Agent': user_agent }
	req = urllib2.Request(url, None, headers)
	response = urllib2.urlopen(req)
	html = response.read()
	response.close()

	images = re.findall('<meta itemprop="full" content="(.*?)"', html, re.M)
	listing.append(images)
	print '  - downloading ' + str(len(images)) + ' images...'
	for i, image in enumerate(images):
		filename = id + '-' + str(i).zfill(3) + '.png'
		urllib.urlretrieve(image, os.path.join(image_folder, filename))


print 'saving listing data to file...'
with open('listings.csv', 'w') as f:
	f.write('id,date,url,images\n')
	for l in listings:
		f.write(l[0] + ',' + l[1] + ',' + l[2] + ',')
		f.write('"' + ','.join(l[3]) + '"\n')
print '- all done'