'''
GET PATENT IMAGES
Jeff Thompson | 2017 | jeffreythompson.org

A little Python script that automatically downloads
images associated with a patent listing. Use Google's Patent
Search first, then the "Download (CSV)" button to save a
file with all the results.

What it does, in order:
  1. reads the id, date and url columns from the search CSV
  2. fetches each listing page and pulls out its full-size image urls
  3. saves every image into the image folder as <id>-<NNN>.png
  4. writes a summary of everything found to listings.csv

Written for Python 2 (it relies on urllib2). Every print call
takes a single argument, so print(...) behaves exactly like the
print statement here.
'''

import csv
import urllib
import urllib2
import re
import os

csv_filename = 'search.csv'         # file to load from
image_folder = 'images'             # folder to save images to (will be created)
listings_filename = 'listings.csv'  # summary file written at the end

# columns of the search CSV that we read
ID_COLUMN = 0
DATE_COLUMN = 7
URL_COLUMN = 8

# use a "user agent" otherwise we'll get blocked by Google :)
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
headers = {'User-Agent': user_agent}

# full-size image urls are embedded in the page as <meta itemprop="full"> tags
image_pattern = re.compile(r'<meta itemprop="full" content="(.*?)"', re.M)


print('extracting urls...')
listings = []
with open(csv_filename) as f:
    next(f, None)   # skip first line (with search details)
    next(f, None)   # skip second line (csv header)
    for row in csv.reader(f, quotechar='"'):
        # blank or truncated rows have no url column to read
        if len(row) <= URL_COLUMN:
            continue
        listings.append([row[ID_COLUMN], row[DATE_COLUMN], row[URL_COLUMN]])
print('- found ' + str(len(listings)))


print('getting images...')
if not os.path.exists(image_folder):
    os.makedirs(image_folder)

for listing in listings:
    patent_id, date, url = listing
    print('- ' + url)

    print(' - downloading page source...')
    try:
        response = urllib2.urlopen(urllib2.Request(url, None, headers))
        try:
            html = response.read()
        finally:
            # always release the connection, even if the read fails
            response.close()
    except (urllib2.URLError, IOError) as err:
        # one unreachable page should not throw away the rest of the run;
        # record an empty image list so the summary row is still written
        print(' - could not download page: ' + str(err))
        listing.append([])
        continue

    images = image_pattern.findall(html)
    listing.append(images)

    print(' - downloading ' + str(len(images)) + ' images...')
    for i, image in enumerate(images):
        filename = patent_id + '-' + str(i).zfill(3) + '.png'
        try:
            urllib.urlretrieve(image, os.path.join(image_folder, filename))
        except IOError as err:
            # skip the broken image, keep going with the others
            print(' - could not download image ' + image + ': ' + str(err))


print('saving listing data to file...')
with open(listings_filename, 'w') as f:
    # csv.writer quotes any field that needs it (e.g. the comma-joined
    # image list); '\n' keeps the same line endings as before
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['id', 'date', 'url', 'images'])
    for patent_id, date, url, images in listings:
        writer.writerow([patent_id, date, url, ','.join(images)])

print('- all done')