Last active
November 5, 2024 21:33
-
-
Save xflr6/759737dc06b290a009352d3307782a2b to your computer and use it in GitHub Desktop.
Download all available audio books from ICE portal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download all available audio books from DB ICE Portal.""" | |
import json | |
import os | |
import urllib.parse | |
import urllib.request | |
# Root of the portal's JSON API; every endpoint URL is derived from it.
# The trailing slash is significant: without it, urljoin() would replace the
# last path segment ('rs') instead of appending to it.
BASE = 'http://iceportal.de/api1/rs/'
def load_json(url: str, *, verbose: bool = True):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        verbose: When true, print *url* before requesting it.

    Returns:
        The decoded JSON document.
    """
    if verbose:
        print(url)
    # The context manager closes the response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        return json.load(response)
def get_page(href: str, *,
             base: str = urllib.parse.urljoin(BASE, 'page/')):
    """Load the JSON document of the page *href*.

    Args:
        href: Page reference; leading slashes are stripped so that it is
            resolved below *base* rather than against the host root.
        base: URL prefix that *href* is joined to. The default is computed
            once, at definition time, from ``BASE``.

    Returns:
        The decoded JSON document returned by ``load_json``.
    """
    relative = href.lstrip('/')
    return load_json(urllib.parse.urljoin(base, relative))
def retrieve(source, target, *,
             base: str = urllib.parse.urljoin(BASE, 'audiobooks/path/')) -> None:
    """Download the audio file referenced by *source* to *target*.

    The JSON document at *base* + *source* is loaded first; its ``'path'``
    entry is joined to *base* to obtain the actual download URL.

    The data is written to ``<target>.part`` and only renamed to *target*
    once the transfer has finished, so an interrupted download never leaves
    a truncated file under the final name.

    Args:
        source: Reference to the file; leading slashes are stripped before
            it is joined to *base*.
        target: Local path the downloaded file is stored under.
        base: URL prefix used for both lookups. The default is computed
            once, at definition time, from ``BASE``.

    Raises:
        Whatever ``load_json``, ``urllib.request.urlretrieve`` or
        ``os.replace`` raise; the partial file is removed before the
        exception propagates.
    """
    sheet = urllib.parse.urljoin(base, source.lstrip('/'))
    path = load_json(sheet)['path']
    url = urllib.parse.urljoin(base, path)

    target = os.fspath(target)
    partial = target + (b'.part' if isinstance(target, bytes) else '.part')
    try:
        urllib.request.urlretrieve(url, filename=partial)
        os.replace(partial, target)
    except BaseException:
        # Also clean up on KeyboardInterrupt, then let the error propagate.
        try:
            os.remove(partial)
        except OSError:
            pass
        raise
# Walk every teaser group of the audio book overview page and download the
# files of each listed title into a directory named after that title.
audiobooks = get_page('hoerbuecher')

for group in audiobooks['teaserGroups']:
    for item in group['items']:
        print('', item['title'], sep='\n')
        page = get_page(item['navigation']['href'])

        # Derive a directory name from the page title: dots become
        # underscores, a few problematic characters are dropped, and
        # everything from the first colon onwards is cut off.
        dirname = page['title']
        dirname = dirname.replace('.', '_')
        for remove_char in ('"', '?', '&', '/', '|'):
            dirname = dirname.replace(remove_char, '')
        dirname, _, _ = dirname.partition(':')

        os.makedirs(dirname, exist_ok=True)

        for file in page['files']:
            url = file['path']
            # '<serial number> - <last URL path segment>'
            target = os.path.join(dirname,
                                  '{:d} - {}'.format(file['serialNumber'],
                                                     url.rpartition('/')[2]))
            # Files already present from an earlier run are not fetched again.
            if os.path.exists(target):
                continue
            try:
                retrieve(url, target)
            except OSError as e:
                # Network errors (URLError, ContentTooShortError, timeouts)
                # are OSError subclasses. Report and go on with the
                # remaining files instead of aborting the whole run.
                print('failed: {} ({})'.format(target, e))
                # Drop a truncated file so the next run retries it.
                try:
                    os.remove(target)
                except OSError:
                    pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! I used this script yesterday, and it worked for quite a while.
But then I saw the following behaviour: a file was downloaded, its size shrank to zero, it was re-downloaded, shrank to zero again, was re-downloaded, and so on. It was more or less an endless loop until the Wi-Fi connection itself was lost.
Then I debugged and saw that this behaviour occurred in the line
urllib.request.urlretrieve(url, filename=target)
in the retrieve
function. Has anyone else seen this behaviour and/or have an idea how to stop it?
Could it be that urlretrieve gets a redirect while it is loading, starts a re-download, gets a redirect again, starts another re-download, and so on?
Is there a parameter for this function that would make it ignore such redirects/re-downloads, or another internal function that does more or less the same?
I would be happy if urlretrieve threw an exception or returned an error code when this happens, so the script could catch it and download the remaining files.