Created
February 9, 2019 16:23
-
-
Save Nosgoroth/0fab5bf3888e114ab42b6b1dcf4159da to your computer and use it in GitHub Desktop.
Download all chapters of Taishou Otome Otogibanashi to cbz from Sea Otter Scans for archival
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys, zipfile, re, requests, json, tempfile, time | |
from pprint import pprint | |
class Chapter:
    """A single chapter of the series: its reader URL and display title."""

    # Class-level defaults; both are overwritten per instance in __init__.
    url = None
    title = None

    def __init__(self, tup):
        # The chapter-list scraper hands us a (url, title) regex-match tuple.
        (self.url, self.title) = tup
def getImageListForChapterWithUrl(url):
    """Scrape the reader page at *url* and return its list of image URLs.

    The reader embeds the page list as a JavaScript literal
    (``var pages = [...]``) that happens to be valid JSON; we extract it
    with a regex and parse it.

    Returns a list of image URL strings, or None if the list cannot be
    found or parsed.
    """
    response = requests.get(url)
    match = re.search(r'var pages = (\[.*\]);', response.text)
    if not match:
        print("Couldn't retrieve image list from page")
        return None
    try:
        pages = json.loads(match.group(1))
    except ValueError:  # json.loads raises ValueError (JSONDecodeError subclasses it)
        print("Couldn't parse image list JSON")
        return None
    return [page["url"] for page in pages]
def getChapterListFromBaseUrl(url):
    """Fetch the series index page at *url* and return a list of Chapter objects.

    Expected link shape (scraped, not parsed):
    <a href="https://.../read/..." title="Chapter N: Title ">

    Returns None when no chapter links are found.
    """
    response = requests.get(url)
    # Regex "HTML parsing" — fragile in general, but adequate for this one
    # known page layout. (Zalgo is Tony the Pony he COMES.)
    matches = re.findall(r'<a href="(http[^"]+\/read\/[^"]+)" title="([^"]+)"', response.text)
    if not matches:
        print("Couldn't retrieve chapter list from page")
        return None
    return [Chapter(pair) for pair in matches]
def downloadImageListToZip(title, images):
    """Download every URL in *images* and pack them into '<title>.cbz'.

    Images are stored in the archive as 001.<ext>, 002.<ext>, ... in list
    order. Returns True on full success; False when the target file already
    exists or any image failed to download. Temporary files are cleaned up
    even on error or KeyboardInterrupt (which is re-raised after cleanup).
    """
    tempdir = tempfile.mkdtemp("taishoo")
    success = True
    zipname = None
    try:
        # Build a safe filename: replace odd characters with spaces, then
        # collapse runs of whitespace and trim before the extension.
        zipname = re.sub(r'[^\d\w\.\-\_]', ' ', title) + ".cbz"
        zipname = re.sub(r'[\s]+', ' ', zipname)
        zipname = re.sub(r'[\s]+\.cbz$', '.cbz', zipname)
        if os.path.exists(zipname):
            print("File already exists")
            return False
        with zipfile.ZipFile(zipname, 'w') as zipobj:
            i = 0
            for imageurl in images:
                i += 1
                filepath = None
                try:
                    root, ext = os.path.splitext(imageurl)
                    filepath = os.path.join(tempdir, str(i).zfill(3) + ext)
                    r = requests.get(imageurl, allow_redirects=True)
                    # Close the file promptly instead of leaking the handle.
                    with open(filepath, 'wb') as imgfile:
                        imgfile.write(r.content)
                    # arcname keeps the temp-directory path out of the archive
                    # (without it the entry name is the full temp path).
                    zipobj.write(filepath, arcname=os.path.basename(filepath))
                    os.remove(filepath)
                    print("Downloaded image %d" % i)
                except KeyboardInterrupt:
                    if filepath:
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    raise
                except Exception:
                    if filepath:
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    print("Error getting image %d" % i)
                    success = False
                # Be polite to the server between requests.
                # NOTE(review): original indentation was lost in transcription;
                # the sleep may have applied only after errors — confirm.
                time.sleep(2)
    except KeyboardInterrupt:
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        raise
    except Exception:
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        print("An error occurred")
        success = False
    finally:
        # tempdir should be empty by now; ignore failure if it is not.
        try:
            os.rmdir(tempdir)
        except OSError:
            pass
    return success
def main():
    """Fetch the chapter list and download each chapter to a .cbz file.

    Ctrl-C aborts quietly at any point.
    """
    try:
        print("Retrieving chapter list...")
        chapters = getChapterListFromBaseUrl("https://reader.seaotterscans.com/series/taishau_wotome_otogibanashi/")
        if not chapters:
            # getChapterListFromBaseUrl returns None on failure;
            # len(None) would raise TypeError here.
            print("No chapters found; aborting.")
            return
        print("Found %d chapters." % len(chapters))
        errors = False
        for chapter in chapters:
            print("Processing chapter: %s" % chapter.title)
            images = getImageListForChapterWithUrl(chapter.url)
            if not images:
                # Same guard: the image scraper returns None on failure.
                print("No images found for this chapter; skipping.")
                errors = True
                continue
            print("Found %d images." % len(images))
            res = downloadImageListToZip(chapter.title, images)
            if res:
                print("Chapter downloaded successfully")
            else:
                print("Chapter downloaded with errors, or not downloaded")
                errors = True
        if errors:
            print("Finished with errors")
        else:
            print("Finished successfully!")
    except KeyboardInterrupt:
        pass

if __name__ == '__main__':
    main()
Hi! Thanks a lot for this script! :) Alas, it fails for me. It hangs for a while (like a minute) on the "Retrieving chapter list" step and then fails with a "certificate verify failed" error message from the Python library. The Sea Otter website is accessible for me in a browser, though. Do you have any idea whether there is a way around this?
I don't really know, but you could try adding the `verify=False`
parameter to the `requests.get` calls, as suggested on Stack Overflow. Note that this disables TLS certificate verification entirely, so treat it as a last resort.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! Thanks a lot for this script! :) Alas, it fails for me. It hangs for a while (like a minute) on the "Retrieving chapter list" step and then fails with a "certificate verify failed" error message from the Python library. The Sea Otter website is accessible for me in a browser, though. Do you have any idea whether there is a way around this?
The stack trace is this:
Retrieving chapter list... Traceback (most recent call last): File "taishooo.py", line 125, in <module> main() File "taishooo.py", line 96, in main chapters = getChapterListFromBaseUrl("https://reader.seaotterscans.com/series/taishau_wotome_otogibanashi/") File "taishooo.py", line 28, in getChapterListFromBaseUrl x = requests.get(url) File "/usr/lib/python2.7/dist-packages/requests/api.py", line 55, in get return request('get', url, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/api.py", line 44, in request return session.request(method=method, url=url, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 467, in request resp = self.send(prep, **send_kwargs) File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 570, in send r = adapter.send(request, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/adapters.py", line 385, in send raise SSLError(e) requests.exceptions.SSLError: [Errno 1] _ssl.c:510: error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed