Last active
September 23, 2021 00:22
-
-
Save ayyybe/85b431586bb98908a8a3d983886c12a5 to your computer and use it in GitHub Desktop.
download & reconstruct a local copy of hosted EPUBs (to download online textbooks, etc.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import shutil | |
import zipfile | |
import tempfile | |
try: | |
import readline # input gets limited to 1024 characters for some reason if this isn't imported (macos only?) | |
except ImportError: | |
pass | |
import urllib.request | |
import xml.etree.ElementTree as ET | |
from threading import Thread | |
# EPUB spec:
# https://www.w3.org/publishing/epub3/epub-spec.html
# This script is not fully spec-compliant: it does not support EPUBs with multiple renditions/root files, and it ignores the optional META-INF files (encryption.xml, manifest.xml, metadata.xml, rights.xml, signatures.xml).
# That said, I have yet to find any EPUBs that actually use any of these features.
def dl(path, *, base_url=None, staging_dir=None):
    """Download one EPUB resource into the staging directory.

    Args:
        path: Resource path relative to the EPUB root, using ``/`` as the
            separator (e.g. ``META-INF/container.xml``).
        base_url: URL of the EPUB root. Defaults to the module-level ``root``
            that the script's main section sets.
        staging_dir: Local directory mirroring the EPUB layout. Defaults to
            the module-level ``staging`` that the script's main section sets.

    Returns:
        The local filesystem path the resource was saved to.

    Raises:
        ValueError: If ``path`` is empty, absolute, or would resolve outside
            the EPUB root / staging directory.
    """
    import posixpath  # local import keeps this function self-contained

    if base_url is None:
        base_url = root
    if staging_dir is None:
        staging_dir = staging

    # Manifest hrefs come from the remote server, so collapse "a/../b" and
    # refuse anything that would be written outside the staging directory.
    relative = posixpath.normpath(path)
    if relative == '.' or relative == '..' or relative.startswith(('/', '../')):
        raise ValueError('Refusing to download outside the EPUB root: ' + path)

    print('Downloading ' + path)
    # URLs always use forward slashes; os.path.join would use the local OS
    # separator and would drop the root entirely for an absolute path.
    url = base_url.rstrip('/') + '/' + relative
    staging_path = os.path.join(staging_dir, *relative.split('/'))
    os.makedirs(os.path.dirname(staging_path), exist_ok=True)
    urllib.request.urlretrieve(url, staging_path)
    return staging_path
def manifest_hrefs(package):
    """Return the ``href`` of every ``item`` element in a parsed OPF package.

    Tags are matched by local name, so the document may or may not declare an
    XML namespace. Elements without an ``href`` attribute are skipped.
    """
    return [
        el.attrib['href']
        for el in package.iter()
        if isinstance(el.tag, str)
        and el.tag.rsplit('}', 1)[-1] == 'item'
        and 'href' in el.attrib
    ]


def write_epub(staging_dir, dest):
    """Zip ``staging_dir`` into an EPUB container at ``dest``.

    The ``mimetype`` file, when present, is written first and uncompressed,
    as the EPUB container format requires; every other file is deflated.
    Directory entries are not written. The parent directory of ``dest`` is
    created if needed.
    """
    os.makedirs(os.path.dirname(os.path.abspath(dest)), exist_ok=True)
    mimetype_path = os.path.join(staging_dir, 'mimetype')
    with zipfile.ZipFile(dest, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        if os.path.isfile(mimetype_path):
            archive.write(mimetype_path, 'mimetype',
                          compress_type=zipfile.ZIP_STORED)
        for dirpath, dirnames, filenames in os.walk(staging_dir):
            dirnames.sort()  # deterministic traversal order
            for filename in sorted(filenames):
                full_path = os.path.join(dirpath, filename)
                arcname = os.path.relpath(full_path, staging_dir)
                arcname = arcname.replace(os.sep, '/')
                if arcname == 'mimetype':
                    continue  # already written as the first entry
                archive.write(full_path, arcname)


if __name__ == '__main__':
    import posixpath
    import urllib.parse
    from concurrent.futures import ThreadPoolExecutor

    MAX_PARALLEL_DOWNLOADS = 16

    # `root` and `staging` are module-level names that dl() reads.
    root = input('EPUB Root URL: ') or 'https://education.wiley.com/content/Hughes_Hallett_Calculus_7e/ebook/epub/9781119320494/'
    cookie = input('Cookie: ') or ''
    dest = os.path.abspath(input('Destination filename: ') or 'book.epub')
    staging = tempfile.mkdtemp()

    # Send the cookie with every request made through urllib.request.
    opener = urllib.request.build_opener()
    opener.addheaders = [('cookie', cookie)]
    urllib.request.install_opener(opener)

    print('\n====================\n')
    print('EPUB Root URL: ' + root)
    print('Cookie: ' + cookie)
    print('Destination filename: ' + dest)
    print('Staging directory: ' + staging)
    input('\nPress Enter to begin download')
    print('')

    failures = []
    try:
        # required mimetype file
        dl('mimetype')

        # required container.xml file, also contains path to rendition/rootfile
        container = ET.parse(dl('META-INF/container.xml')).getroot()
        rootfile_path = next(
            (el.attrib['full-path'] for el in container.iter()
             if isinstance(el.tag, str)
             and el.tag.rsplit('}', 1)[-1] == 'rootfile'
             and 'full-path' in el.attrib),
            None,
        )
        if rootfile_path is None:
            raise SystemExit('No rootfile found in META-INF/container.xml')
        # Paths inside an EPUB always use forward slashes.
        root_dir = posixpath.dirname(rootfile_path)
        print('Found rootfile: ' + rootfile_path)

        # download rootfile & rip all linked resources
        rootfile = ET.parse(dl(rootfile_path)).getroot()
        paths = []
        for href in manifest_hrefs(rootfile):
            if urllib.parse.urlsplit(href).scheme:
                # Remote resources are not part of the container.
                print('Skipping remote resource ' + href)
                continue
            paths.append(posixpath.join(root_dir, href))
        paths = list(dict.fromkeys(paths))  # drop duplicates, keep order

        with ThreadPoolExecutor(max_workers=MAX_PARALLEL_DOWNLOADS) as pool:
            futures = {pool.submit(dl, path): path for path in paths}
            for future, path in futures.items():
                try:
                    future.result()
                except Exception as err:
                    # Best effort: report the failure and keep going so one
                    # missing resource does not abort the whole book.
                    failures.append(path)
                    print('Failed to download ' + path + ': ' + str(err))

        # zip everything up
        print('Creating epub...')
        write_epub(staging, dest)
    finally:
        # delete staging dir, even when a download or parse step failed
        print('Cleaning up...')
        shutil.rmtree(staging, ignore_errors=True)

    if failures:
        print('\nWarning: ' + str(len(failures)) + ' resource(s) could not be downloaded')
    print('\nDone! EPUB has been reconstructed at ' + dest)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment