Created
October 12, 2018 04:08
-
-
Save dado3212/e0347f38475cc374a91f019381c0568c to your computer and use it in GitHub Desktop.
A script that downloads every Pearls Before Swine comic strip and runs each one through OCR.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime, requests, re, urllib | |
from StringIO import StringIO | |
from PIL import Image | |
import pytesseract | |
from cgi import escape | |
import json | |
# Root of the comic's archive; the crawl appends the strip date as YYYY/MM/DD.
base_url = "http://www.gocomics.com/pearlsbeforeswine/"
# First date requested from the archive (presumably the earliest strip
# available there -- TODO confirm).
start_date = datetime.datetime(2002, 1, 7)
# Browser-like User-Agent sent with every page request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
# Date of the strip to fetch next; the crawl advances it one day at a time.
date = start_date
# One dict per downloaded comic, appended in the order they are fetched.
comics = []
# Opening markup and stylesheet of the generated page. The charset is declared
# because the page is written out as UTF-8.
_PAGE_HEAD = u'''
<html>
<head>
<meta charset="utf-8">
<style>
.comic {
width: 750px;
margin-bottom: 15px;
}
.comic img {
max-width: 100%;
}
.comic span {
font-weight: bold;
font-family: sans-serif;
font-size: 1.2em;
}
</style>
</head>
<body>
'''


def _to_text(value):
    """Return *value* as unicode text, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _escape_html(value):
    """Escape &, <, > and quotes so *value* is safe in markup and attributes."""
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2
    return escape(_to_text(value), True)


def write_to_file(comics, html_path='index.html', json_path='json.txt'):
    """Write the collected comics to an HTML gallery and a JSON dump.

    Args:
        comics: list of dicts. Each dict must have 'date' and 'small_url'
            keys; an optional 'ocr' key holds the recognised text and is
            rendered as a paragraph under the image.
        html_path: where the HTML page is written. Defaults to the
            previously hard-coded 'index.html'.
        json_path: where the JSON dump of *comics* is written. Defaults to
            the previously hard-coded 'json.txt'.

    Raises:
        KeyError: if a comic lacks 'date' or 'small_url'.
        IOError/OSError: if either file cannot be written.
    """
    print('Writing')

    parts = [_PAGE_HEAD]
    for comic in comics:
        parts.append(u'<div class="comic">')
        parts.append(u'<span>' + _escape_html(comic['date']) + u'</span>')
        # The URL lands inside an attribute, so quotes and '&' must be escaped.
        parts.append(u'<img src="' + _escape_html(comic['small_url']) + u'" />')
        if 'ocr' in comic:
            parts.append(u'<p>' + _escape_html(comic['ocr']) + u'</p>')
        parts.append(u'</div>')
    parts.append(u'</body></html>')

    # Encode explicitly: the handles are binary, and OCR output may contain
    # non-ASCII characters that an implicit conversion would reject.
    with open(html_path, 'wb') as html_file:
        html_file.write(u''.join(parts).encode('utf-8'))

    with open(json_path, 'wb') as json_file:
        json_file.write(json.dumps(comics).encode('utf-8'))

    print('Done')
# Crawl the archive one day at a time, OCR each strip, and always save what
# was collected -- whether the crawl finishes, fails, or is interrupted.

# Regular-size strip image on a comic page.
_SMALL_IMG_RE = re.compile(r'<img alt="Pearls Before Swine" class="strip" src="(.*?)"')
# Zoomed image, which not every page offers.
_ZOOM_IMG_RE = re.compile(r'zoom_link.*?src="(.*?)"')

last_url = ""
cont = True
try:
    # A single session serves the whole crawl instead of a new one per page.
    with requests.Session() as c:
        while cont:
            try:
                # NOTE(review): verify=False disables TLS certificate checks
                # should the request end up on HTTPS -- confirm it is needed.
                comic = c.get(base_url + date.strftime('%Y/%m/%d'), verify=False, headers=headers)
                small_url = _SMALL_IMG_RE.search(comic.text).group(1)

                # Prefer the zoomed image; fall back to the regular one.
                zoom_match = _ZOOM_IMG_RE.search(comic.text)
                url = zoom_match.group(1) if zoom_match else small_url

                if url == last_url:
                    # Same image as the previous day: treated as the end of
                    # the archive, so stop crawling.
                    cont = False
                else:
                    data = urllib.urlopen(url).read()
                    img = Image.open(StringIO(data))
                    comics.append({
                        'url': url,
                        'small_url': small_url,
                        'date': date.strftime('%m/%d/%Y'),
                        'ocr': pytesseract.image_to_string(img),
                    })
                    last_url = url
                    date = date + datetime.timedelta(days=1)
                    print(date.strftime('%Y/%m/%d'))
            except Exception as e:
                # Any fetch/parse/OCR failure ends the crawl; the comics
                # gathered so far are still written by the finally clause.
                print(e)
                cont = False
except KeyboardInterrupt:
    # Ctrl-C stops the crawl early; fall through so the results are saved.
    pass
finally:
    write_to_file(comics)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment