Skip to content

Instantly share code, notes, and snippets.

@Nosgoroth
Created February 9, 2019 16:23
Show Gist options
  • Save Nosgoroth/0fab5bf3888e114ab42b6b1dcf4159da to your computer and use it in GitHub Desktop.
Save Nosgoroth/0fab5bf3888e114ab42b6b1dcf4159da to your computer and use it in GitHub Desktop.
Download all chapters of Taishou Otome Otogibanashi to cbz from Sea Otter Scans for archival
import os, sys, zipfile, re, requests, json, tempfile, time
from pprint import pprint
class Chapter:
    """A single chapter of the series: its reader URL and display title."""

    url = None
    title = None

    def __init__(self, tup):
        """Unpack a ``(url, title)`` pair into instance attributes."""
        self.url, self.title = tup
def getImageListForChapterWithUrl(url):
    """Fetch a reader chapter page and extract its list of image URLs.

    The page embeds a JavaScript array literal ``var pages = [...]``; that
    array is pulled out with a regex and parsed as JSON.

    Returns a list of image URL strings, or None on any failure
    (page layout changed, or the embedded JSON was malformed).
    """
    resp = requests.get(url)
    match = re.search(r'var pages = (\[.*\]);', resp.text)
    if not match:
        print("Couldn't retrieve image list from page")
        return None
    try:
        pages = json.loads(match.group(1))
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        print("Couldn't parse image list JSON")
        return None
    return [page["url"] for page in pages]
def getChapterListFromBaseUrl(url):
    """Scrape the series index page for chapter links.

    Matches anchors of the form
    ``<a href="https://.../read/.../0/2/" title="Chapter 2: ...">``.

    Returns a list of Chapter objects, or None if no chapter links
    were found on the page.
    """
    resp = requests.get(url)
    # Using regex to parse html. Zalgo is Tony the Pony he COMES
    matches = re.findall(r'<a href="(http[^"]+\/read\/[^"]+)" title="([^"]+)"', resp.text)
    if not matches:
        print("Couldn't retrieve chapter list from page")
        return None
    return [Chapter(pair) for pair in matches]
def downloadImageListToZip(title, images):
    """Download every URL in *images* into a ``.cbz`` archive named after *title*.

    Each image is fetched into a private temp directory, appended to the zip,
    then deleted, so at most one image sits on disk at a time. A 2-second
    pause between downloads keeps load on the server polite.

    Returns True on full success; False if the target file already existed
    or any image failed. On KeyboardInterrupt the partial archive is removed
    and the interrupt is re-raised so the caller can stop cleanly.
    """
    tempdir = tempfile.mkdtemp("taishoo")
    success = True
    zipname = None
    try:
        # Sanitize the title into a safe filename: replace odd characters
        # with spaces, collapse whitespace runs, trim before the extension.
        zipname = re.sub(r'[^\d\w\.\-\_]', ' ', title) + ".cbz"
        zipname = re.sub(r'[\s]+', ' ', zipname)
        zipname = re.sub(r'[\s]+\.cbz$', '.cbz', zipname)
        if os.path.exists(zipname):
            print("File already exists")
            return False
        with zipfile.ZipFile(zipname, 'w') as zipobj:
            for i, imageurl in enumerate(images, start=1):
                filepath = None
                try:
                    root, ext = os.path.splitext(imageurl)
                    filepath = os.path.join(tempdir, str(i).zfill(3) + ext)
                    r = requests.get(imageurl, allow_redirects=True)
                    # Close the handle before zipping (the original leaked it).
                    with open(filepath, 'wb') as f:
                        f.write(r.content)
                    zipobj.write(filepath)
                    os.remove(filepath)
                    print("Downloaded image %d" % i)
                except KeyboardInterrupt:
                    if filepath and os.path.exists(filepath):
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    raise
                except Exception:
                    if filepath and os.path.exists(filepath):
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    print("Error getting image %d" % i)
                    success = False
                time.sleep(2)
    except KeyboardInterrupt:
        # Remove the partial archive, then let the interrupt propagate.
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        raise
    except Exception:
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        print("An error occurred")
        success = False
    finally:
        # tempdir should be empty by now; best-effort cleanup either way.
        try:
            os.rmdir(tempdir)
        except OSError:
            pass
    return success
def main():
    """Download every chapter of the series to a .cbz archive, one per chapter.

    Ctrl-C aborts cleanly (the current partial archive is removed by
    downloadImageListToZip before the interrupt reaches us).
    """
    try:
        print("Retrieving chapter list...")
        chapters = getChapterListFromBaseUrl("https://reader.seaotterscans.com/series/taishau_wotome_otogibanashi/")
        if not chapters:
            # The scraper returns None on failure; the original crashed
            # here with len(None).
            print("No chapters found, aborting.")
            return
        print("Found %d chapters." % len(chapters))
        errors = False
        for chapter in chapters:
            print("")
            print("Processing chapter: %s" % chapter.title)
            images = getImageListForChapterWithUrl(chapter.url)
            if not images:
                # Same None-on-failure contract as above; skip, don't crash.
                print("Skipping chapter: couldn't get image list")
                errors = True
                continue
            print("Found %d images." % len(images))
            res = downloadImageListToZip(chapter.title, images)
            if res:
                print("Chapter downloaded successfully")
            else:
                print("Chapter downloaded with errors, or not downloaded")
                errors = True
        print("")
        print("")
        if errors:
            print("Finished with errors")
        else:
            print("Finished successfully!")
    except KeyboardInterrupt:
        pass

if __name__ == '__main__':
    main()
@SpectrumDT
Copy link

Hi! Thanks a lot for this script! :) Alas, it fails for me. It hangs for a while (like a minute) on the "Retrieving chapter list" step and then fails with a "certificate verify failed" error message from the Python library. The Sea Otter website is accessible for me in a browser, though. Do you have any idea whether there is a way around this?

The stack trace is this:

Retrieving chapter list... Traceback (most recent call last): File "taishooo.py", line 125, in <module> main() File "taishooo.py", line 96, in main chapters = getChapterListFromBaseUrl("https://reader.seaotterscans.com/series/taishau_wotome_otogibanashi/") File "taishooo.py", line 28, in getChapterListFromBaseUrl x = requests.get(url) File "/usr/lib/python2.7/dist-packages/requests/api.py", line 55, in get return request('get', url, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/api.py", line 44, in request return session.request(method=method, url=url, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 467, in request resp = self.send(prep, **send_kwargs) File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 570, in send r = adapter.send(request, **kwargs) File "/usr/lib/python2.7/dist-packages/requests/adapters.py", line 385, in send raise SSLError(e) requests.exceptions.SSLError: [Errno 1] _ssl.c:510: error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed

@Nosgoroth
Copy link
Author

Hi! Thanks a lot for this script! :) Alas, it fails for me. It hangs for a while (like a minute) on the "Retrieving chapter list" step and then fails with a "certificate verify failed" error message from the Python library. The Sea Otter website is accessible for me in a browser, though. Do you have any idea whether there is a way around this?

I don't really know, but you could try adding the `verify=False` parameter to the `requests.get(...)` calls, as suggested on Stack Overflow. Note that this disables TLS certificate verification entirely, so only use it if you accept the risk of a man-in-the-middle attack.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment