dado3212 · October 12, 2018 04:08
diff --git a/downloadPearls.py b/downloadPearls.py
 import datetime, requests, re, urllib
 from StringIO import StringIO
 from PIL import Image
 import pytesseract
 from cgi import escape
 import json

 # Root of the strip's archive; the scrape loop appends a YYYY/MM/DD path to it.
 base_url = "http://www.gocomics.com/pearlsbeforeswine/"
 # First date the scrape loop requests.
 start_date = datetime.datetime(year=2002, month=1, day=7)
 # Browser-like User-Agent sent with every page request.
 headers = {
 	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
 }

 # Mutable scrape state: the records gathered so far and the date being fetched.
 comics = []

 date = start_date

 def write_to_file(comics):
 	"""Write the collected comics to ``index.html`` and ``json.txt``.

 	Both files are opened with mode ``'wb'`` using relative paths, so they are
 	created in the current working directory and overwrite earlier output.

 	Args:
 		comics: iterable of dicts. Each needs the keys ``date`` and
 			``small_url`` (written as-is); an ``ocr`` key, when present,
 			is HTML-escaped and written as a paragraph. The whole
 			collection is also serialized with ``json.dumps``.
 	"""
 	# A parenthesized single argument prints the same under the Python 2
 	# print statement, and keeps the line parseable by Python 3 tooling.
 	print('Writing')

 	# ``with`` closes the file even if one of the writes raises.
 	with open('index.html', 'wb') as f:
 		f.write('''
 		<html>
 			<head>
 				<style>
 					.comic {
 						width: 750px;
 						margin-bottom: 15px;
 					}

 					.comic img {
 						max-width: 100%;
 					}

 					.comic span {
 						font-weight: bold;
 						font-family: sans-serif;
 						font-size: 1.2em;
 					}
 				</style>
 			</head>
 			<body>

 	''')

 		for comic in comics:
 			f.write('<div class="comic">')
 			f.write('<span>' + comic['date'] + '</span>')
 			# NOTE(review): small_url is written unescaped. The scrape loop in
 			# this file captures it straight from page markup, so it is
 			# presumably already HTML-encoded - confirm before escaping it.
 			f.write('<img src="' + comic['small_url'] + '" />')
 			if 'ocr' in comic:
 				f.write('<p>' + escape(comic['ocr']) + '</p>')
 			f.write('</div>')

 		f.write('</body></html>')

 	with open('json.txt', 'wb') as t:
 		t.write(json.dumps(comics))

 	print('Done')

 # URL of the image fetched on the previous iteration; seeing it again ends the scrape.
 last_url = ""

 cont = True
 try:
 	while cont:
 		try:
 			with requests.Session() as c:
 				comic = c.get(base_url + date.strftime('%Y/%m/%d'), verify=False, headers=headers) # initializes the headers, cookies
 				# No match here raises AttributeError, handled below as a fatal error.
 				small_url = re.search('<img alt="Pearls Before Swine" class="strip" src="(.*?)"', comic.text).group(1)
 				# Use the zoomed image when the page has one, otherwise the inline strip.
 				zoom = re.search('zoom_link.*?src="(.*?)"', comic.text)
 				url = zoom.group(1) if zoom else small_url

 				if url == last_url:
 					# Same image as the previous date: treated as the end of the archive.
 					cont = False
 				else:
 					data = urllib.urlopen(url).read()
 					img = Image.open(StringIO(data))

 					comics.append({'url': url, 'small_url': small_url, 'date': date.strftime('%m/%d/%Y'), 'ocr': pytesseract.image_to_string(img)})
 				last_url = url

 			date = date + datetime.timedelta(days=1)
 			# Printed after the increment, so this shows the next date to be fetched.
 			print(date.strftime('%Y/%m/%d'))
 		except Exception as e:
 			# Any failure stops the scrape; what was gathered is still saved below.
 			print(e)
 			cont = False
 except KeyboardInterrupt:
 	# An interrupt is an accepted way to stop early; fall through to the save.
 	pass
 finally:
 	# Save once on every exit path: end of archive, error, or interrupt.
 	write_to_file(comics)
	import datetime, requests, re, urllib
	from StringIO import StringIO
	from PIL import Image
	import pytesseract
	from cgi import escape
	import json

	# Root of the strip's archive; the scrape loop appends a YYYY/MM/DD path to it.
	base_url = "http://www.gocomics.com/pearlsbeforeswine/"
	# First date the scrape loop requests.
	start_date = datetime.datetime(year=2002, month=1, day=7)
	# Browser-like User-Agent sent with every page request.
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
	}

	# Mutable scrape state: the records gathered so far and the date being fetched.
	comics = []

	date = start_date

	def write_to_file(comics):
		"""Write the collected comics to ``index.html`` and ``json.txt``.

		Both files are opened with mode ``'wb'`` using relative paths, so they
		are created in the current working directory and overwrite earlier
		output.

		Args:
			comics: iterable of dicts. Each needs the keys ``date`` and
				``small_url`` (written as-is); an ``ocr`` key, when present,
				is HTML-escaped and written as a paragraph. The whole
				collection is also serialized with ``json.dumps``.
		"""
		# A parenthesized single argument prints the same under the Python 2
		# print statement, and keeps the line parseable by Python 3 tooling.
		print('Writing')

		# ``with`` closes the file even if one of the writes raises.
		with open('index.html', 'wb') as f:
			f.write('''
	<html>
	<head>
	<style>
	.comic {
	width: 750px;
	margin-bottom: 15px;
	}

	.comic img {
	max-width: 100%;
	}

	.comic span {
	font-weight: bold;
	font-family: sans-serif;
	font-size: 1.2em;
	}
	</style>
	</head>
	<body>

	''')

			for comic in comics:
				f.write('<div class="comic">')
				f.write('<span>' + comic['date'] + '</span>')
				f.write('<img src="' + comic['small_url'] + '" />')
				if 'ocr' in comic:
					f.write('<p>' + escape(comic['ocr']) + '</p>')
				f.write('</div>')

			f.write('</body></html>')

		with open('json.txt', 'wb') as t:
			t.write(json.dumps(comics))

		print('Done')

	# URL of the image fetched on the previous iteration; seeing it again ends the scrape.
	last_url = ""

	cont = True
	try:
		while cont:
			try:
				with requests.Session() as c:
					comic = c.get(base_url + date.strftime('%Y/%m/%d'), verify=False, headers=headers) # initializes the headers, cookies
					# No match here raises AttributeError, handled below as a fatal error.
					small_url = re.search('<img alt="Pearls Before Swine" class="strip" src="(.*?)"', comic.text).group(1)
					# Use the zoomed image when the page has one, otherwise the inline strip.
					# The quantifiers are '.*?': with '.?' the group could capture at most
					# one character, never a whole URL.
					zoom = re.search('zoom_link.*?src="(.*?)"', comic.text)
					url = zoom.group(1) if zoom else small_url

					if url == last_url:
						# Same image as the previous date: treated as the end of the archive.
						cont = False
					else:
						data = urllib.urlopen(url).read()
						img = Image.open(StringIO(data))

						comics.append({'url': url, 'small_url': small_url, 'date': date.strftime('%m/%d/%Y'), 'ocr': pytesseract.image_to_string(img)})
					last_url = url

				date = date + datetime.timedelta(days=1)
				# Printed after the increment, so this shows the next date to be fetched.
				print(date.strftime('%Y/%m/%d'))
			except Exception as e:
				# Any failure stops the scrape; what was gathered is still saved below.
				print(e)
				cont = False
	except KeyboardInterrupt:
		# An interrupt is an accepted way to stop early; fall through to the save.
		pass
	finally:
		# Save once on every exit path: end of archive, error, or interrupt.
		write_to_file(comics)