-
-
Save contrequarte/236a3ad965a4fbd5aefe29b6e2d2201c to your computer and use it in GitHub Desktop.
Download all available audio books from ICE portal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download all available audio books from DB ICE Portal.""" | |
import json | |
import os | |
import urllib.parse | |
import urllib.request | |
BASE = 'http://iceportal.de/api1/rs/' | |
def load_json(url: str, *, verbose: bool = True):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address to open with ``urllib.request.urlopen``.
        verbose: When true, print *url* before the request is made.

    Returns:
        The decoded JSON document.
    """
    if verbose:
        print(url)
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return json.loads(payload)
def get_page(href: str, *,
             base: str = urllib.parse.urljoin(BASE, 'page/')):
    """Return the JSON document of the portal page addressed by *href*.

    Args:
        href: Page reference; leading slashes are dropped so that it is
            resolved relative to *base*.
        base: URL prefix the reference is joined onto.

    Returns:
        The parsed JSON document (the URL is not printed).
    """
    relative = href.lstrip('/')
    page_url = urllib.parse.urljoin(base, relative)
    return load_json(page_url, verbose=False)
def retrieve(source, target, *,
             base: str = urllib.parse.urljoin(BASE, 'audiobooks/path/')) -> None:
    """Download the file referenced by *source* and store it as *target*.

    The JSON document at ``base`` + *source* is loaded first; its ``'path'``
    entry, joined onto *base*, is the URL that is actually downloaded.

    The data is written to ``target + '.part'`` and only moved to *target*
    once the download has finished, so an interrupted or truncated transfer
    never leaves a file at *target* that looks complete.

    Args:
        source: Reference of the file; leading slashes are dropped so that
            it is resolved relative to *base*.
        target: Local file name to write.
        base: URL prefix used for both the lookup and the download.

    Raises:
        KeyError: If the looked-up document has no ``'path'`` entry.
        OSError: If the download or the final rename fails (this includes
            ``urllib.error.URLError`` and ``ContentTooShortError``).
    """
    sheet = urllib.parse.urljoin(base, source.lstrip('/'))
    path = load_json(sheet)['path']
    url = urllib.parse.urljoin(base, path)

    partial = os.fspath(target) + '.part'
    try:
        urllib.request.urlretrieve(url, filename=partial)
        os.replace(partial, target)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)
# Walk every teaser on the audio book overview page and download all files
# of each title into a directory named after the title.
audiobooks = get_page('hoerbuecher')
for group in audiobooks['teaserGroups']:
    for item in group['items']:
        page = get_page(item['navigation']['href'])
        content_type = page['contentType']
        print('', item['title'], sep='\n')
        print(content_type)

        # Keep the serial number next to each path: it is needed to give
        # podcast episodes distinct file names, otherwise the download would
        # stop after the first episode because the target already exists.
        files = [(p['serialNumber'], p['path']) for p in page['files']]

        # Turn the page title into a usable directory name: dots become
        # underscores, a few problematic characters are dropped and anything
        # from the first colon onwards is cut off.
        dirname = page['title'].replace('.', '_')
        for remove_char in ('"', '?', '&', '/', '|'):
            dirname = dirname.replace(remove_char, '')
        dirname, _, _ = dirname.partition(':')
        os.makedirs(dirname, exist_ok=True)

        for serial, path in files:
            filename = path.rpartition('/')[2]
            if content_type == 'podcast':
                # Insert the serial number once, directly before the
                # extension of the file name.
                stem, ext = os.path.splitext(filename)
                filename = '{0}{1}{2}'.format(stem, serial, ext)
            target = os.path.join(dirname, filename)
            print(target)
            # Files that are already present are not downloaded again.
            if not os.path.exists(target):
                retrieve(path, target)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment