woky · September 2, 2021 15:44
diff --git a/.podcasts.lst b/.podcasts.lst
 # https://thehistoryofrome.typepad.com/ # finished
 #https://feeds.feedburner.com/TheHistoryOfRome

 # https://thehistoryofrome.typepad.com/revolutions_podcast/
 https://revolutionspodcast.libsyn.com/rss

 # https://thehistoryofbyzantium.com/
 https://rss.acast.com/thehistoryofbyzantium

 # https://darknetdiaries.com/
 https://feeds.megaphone.fm/darknetdiaries

 # https://hubermanlab.libsyn.com/
 https://hubermanlab.libsyn.com/rss

 # https://routingtable.cloud/
 https://anchor.fm/s/1a3cf0b8/podcast/rss

 # http://www.astronomycast.com/
 https://astronomycast.libsyn.com/rss

 # https://americanbiography.webs.com/
 #https://rss.acast.com/americanbiography

 # http://ethnopolis.co.uk/
 #https://historyofyugoslavia.libsyn.com/rss

 # https://historyofenglishpodcast.com/
 #https://historyofenglishpodcast.com/feed/podcast/

 # https://therealmiddleages.com/
 #https://therealmiddleages.libsyn.com/rss

 # https://thehistoryofvikings.com/ # Noah Tetzner
 #https://feeds.captivate.fm/thehistoryofvikings/

 # http://podcast.storiesofthesecondworldwar.com/ # Noah Tetzner
 #https://feeds.acast.com/public/shows/stories-of-the-second-world-war

 # https://corecursive.com/
 https://corecursive.libsyn.com/feed

 # https://www.acast.com/historyofthepapacy
 #https://rss.acast.com/historyofthepapacy

 # https://thegreatwarpodcast.podbean.com/ # finished
 #https://feed.podbean.com/thegreatwarpodcast/feed.xml

 # https://barrystrauss.com/podcast/
 #https://antiquitas.castos.com/feed

 # https://adspthepodcast.com/
 https://feeds.buzzsprout.com/1501960.rss

 # https://www.arraycast.com/
 https://www.arraycast.com/episodes?format=rss

 # https://handmade.network/podcast
 #https://handmade.network/podcast/podcast.xml

 # https://wondery.com/shows/tides-of-history/
 https://rss.art19.com/tides-of-history
diff --git a/podcastdl b/podcastdl
 #!/usr/bin/env python3

 # Dependencies:
 #   apt install python3-lxml python3-requests python3-dateutil
 #   pacman -S python-lxml python-requests python-dateutil
 #   pip install lxml requests python-dateutil

 import argparse
 import sys, os, os.path, re
 import fnmatch
 from pathlib import Path
 from urllib.parse import urlparse

 import dateutil.parser
 import requests
 from lxml import etree, html

 # Default value of --output-dir: one sub-directory per channel is created here.
 DEFAULT_OUTPUT_DIR = '/mnt/storage/podcasts'
 # List file that is read when no URL is given on the command line.
 DEFAULT_LIST_FILE = os.path.expanduser('~/.podcasts.lst')
 # fnmatch patterns tested against "host + path" of links found on HTML pages.
 RSS_PODCAST_URLS = (
    # Effectively redundant: links are first tested with a regex for the
    # whole words "rss" / "feed(s)", and every pattern below contains one of
    # them, so this tuple is only reached as a fallback.
    'feeds.feedburner.com/*',
    'rss.acast.com/*',
    'feeds.acast.com/public/shows/*',
    '*.libsyn.com/rss',
    'feeds.megaphone.fm/*',
    'anchor.fm/s/*/podcast/rss',
    'feeds.captivate.fm/*',
 )

 # Command line: zero or more feed/page URLs, or "@file" references to
 # list files holding one URL per line.
 argp = argparse.ArgumentParser()
 argp.add_argument('url', nargs='*')
 argp.add_argument('--output-dir', '-o', type=str, default=DEFAULT_OUTPUT_DIR)
 argp.add_argument('--dry-run', '-n', action='store_true')
 argp.add_argument('--verbose', '-v', action='store_true')
 argp.add_argument('--print-full-path', '-P', action='store_true')

 args = argp.parse_args()

 # Without explicit URLs fall back to the default list file; if that does
 # not exist either there is nothing to do, so show the usage and bail out.
 url_args = args.url
 if not url_args:
    if os.path.exists(DEFAULT_LIST_FILE):
        url_args = ['@' + DEFAULT_LIST_FILE]
    else:
        argp.print_usage()
        sys.exit(1)

 # Expand "@file" arguments; '#' starts a comment running to the end of the
 # line, and lines that are blank after stripping are ignored.
 urls = []
 for arg in url_args:
    if not arg.startswith('@'):
        urls.append(arg)
        continue
    with open(arg[1:], 'r') as list_file:
        entries = (re.sub('#.*', '', raw).strip() for raw in list_file)
        urls.extend(entry for entry in entries if entry)

 def text2filename(text: str):
    """Turn free-form text into a file-name-safe ASCII token.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore; an underscore left at the very start or end of the
    result is then removed.
    """
    collapsed = re.sub('[^0-9A-Za-z]+', '_', text)
    return re.sub('^_|_$', '', collapsed)

 # Main work loop.
 #
 # `urls` is a queue of pages/feeds still to visit.  An HTML page is only
 # scanned for feed links when the user listed it explicitly (it is in
 # `top_urls`); feeds discovered that way are put at the front of the queue.
 # An RSS document has every not-yet-downloaded MP3 enclosure fetched into
 # <output-dir>/<channel>/<YYYY-MM-DD>-<item title>.mp3.
 top_urls = set(urls)
 added_urls = set(top_urls)  # everything ever queued, to avoid re-queuing
 while urls:
    rss_url, urls = urls[0], urls[1:]
    if args.verbose:
        print('<<', rss_url)
    try:
        rss_resp = requests.get(rss_url, timeout=15)
        rss_resp.raise_for_status()
    except OSError as e:
        # requests' exceptions derive from OSError (IOError).
        print(rss_url, 'failed to fetch rss/html:', repr(e), file=sys.stderr)
        continue
    # Servers may omit Content-Type; treat that as an unknown type instead
    # of crashing.  Parameters such as "; charset=utf-8" are dropped.
    ctype = rss_resp.headers.get('content-type', '')
    ctype = re.sub(';.*', '', ctype).strip().lower()
    if ctype == 'text/html':
        if rss_url not in top_urls:
            continue
        try:
            tree = html.fromstring(rss_resp.text)
        except (etree.LxmlError, ValueError) as e:
            print(rss_url, 'failed to parse html:', repr(e), file=sys.stderr)
            continue
        # Feed candidates found on this page, in discovery order.
        found_urls = []
        for new_url in tree.xpath('//link[@type="application/rss+xml"]/@href'):
            if new_url not in added_urls:
                found_urls.append(new_url)
                added_urls.add(new_url)
        for new_url in tree.xpath('//a/@href'):
            if new_url in added_urls:
                continue
            looks_like_feed = bool(re.search(r'(?i)\b(?:rss|feeds?)\b', new_url))
            if not looks_like_feed:
                # Fall back to the known-host patterns, matched against
                # "host/path" with any ":port" suffix removed.
                _url = urlparse(new_url)
                _url = re.sub(':.*', '', _url.netloc) + _url.path
                looks_like_feed = any(
                    fnmatch.fnmatch(_url, pattern) for pattern in RSS_PODCAST_URLS)
            if looks_like_feed:
                found_urls.append(new_url)
                added_urls.add(new_url)
        # Visit the discovered feeds before the rest of the queue.
        urls = found_urls + urls
    elif ctype in ('application/rss+xml', 'text/xml', 'application/xml'):
        try:
            tree = etree.fromstring(rss_resp.content)
        except etree.LxmlError as e:
            # One malformed feed must not abort the remaining ones.
            print(rss_url, 'failed to parse rss:', repr(e), file=sys.stderr)
            continue
        if tree.tag != 'rss':
            continue
        for chan in tree.xpath('/rss/channel'):
            chan_title = chan.xpath('title/text()')
            if not chan_title:
                print(rss_url, 'channel has no title', file=sys.stderr)
                continue
            chan_title = text2filename(chan_title[0])
            chan_dir = os.path.join(args.output_dir, chan_title)
            # Walk the items in reverse document order.
            for item in chan.xpath('item')[::-1]:
                item_title = item.xpath('title/text()')
                if not item_title:
                    print(rss_url, 'item has no title', file=sys.stderr)
                    continue
                item_title = text2filename(item_title[0])
                item_time = item.xpath('pubDate/text()')
                if not item_time:
                    print(rss_url, 'item has no pubDate', file=sys.stderr)
                    continue
                try:
                    item_time = dateutil.parser.parse(item_time[0])
                except (ValueError, OverflowError):
                    # item_time is still the xpath result list here.
                    print(rss_url, 'item has invalid pubDate', item_time[0], file=sys.stderr)
                    continue
                item_time = item_time.strftime('%Y-%m-%d')
                # Only MP3 enclosures are supported; take the first one.
                enclosure = None
                for candidate in item.xpath('enclosure[starts-with(@type, "audio/")]'):
                    if candidate.get('type') == 'audio/mpeg':
                        enclosure = candidate
                        ext = '.mp3'
                        break
                if enclosure is None:
                    continue
                media_url = enclosure.get('url')
                if not media_url:
                    print(rss_url, 'item enclosure has no url', file=sys.stderr)
                    continue
                media_file = os.path.join(chan_dir, item_time + '-' + item_title + ext)
                if os.path.exists(media_file):
                    continue
                if args.print_full_path:
                    print(media_file)
                else:
                    print(os.path.relpath(media_file, args.output_dir))
                if args.dry_run:
                    continue
                # Download to a ".tmp" sibling and rename on success so that
                # an interrupted transfer is never mistaken for a finished one.
                media_file_tmp = media_file + '.tmp'
                try:
                    os.makedirs(chan_dir, exist_ok=True)
                    with requests.get(media_url, stream=True, timeout=15) as media_resp:
                        media_resp.raise_for_status()
                        with open(media_file_tmp, 'wb') as f:
                            for chunk in media_resp.iter_content(chunk_size=None):
                                f.write(chunk)
                    os.rename(media_file_tmp, media_file)
                except OSError as e:
                    print(rss_url, f'failed to download {media_url}:', repr(e), file=sys.stderr)
                    if os.path.exists(media_file_tmp):
                        os.unlink(media_file_tmp)
                    continue
diff --git a/podcastdl.crontab b/podcastdl.crontab
 0 4 * * * 	systemd-cat -t podcastdl ~/podcastdl
	# https://thehistoryofrome.typepad.com/ # finished
	#https://feeds.feedburner.com/TheHistoryOfRome

	# https://thehistoryofrome.typepad.com/revolutions_podcast/
	https://revolutionspodcast.libsyn.com/rss

	# https://thehistoryofbyzantium.com/
	https://rss.acast.com/thehistoryofbyzantium

	# https://darknetdiaries.com/
	https://feeds.megaphone.fm/darknetdiaries

	# https://hubermanlab.libsyn.com/
	https://hubermanlab.libsyn.com/rss

	# https://routingtable.cloud/
	https://anchor.fm/s/1a3cf0b8/podcast/rss

	# http://www.astronomycast.com/
	https://astronomycast.libsyn.com/rss

	# https://americanbiography.webs.com/
	#https://rss.acast.com/americanbiography

	# http://ethnopolis.co.uk/
	#https://historyofyugoslavia.libsyn.com/rss

	# https://historyofenglishpodcast.com/
	#https://historyofenglishpodcast.com/feed/podcast/

	# https://therealmiddleages.com/
	#https://therealmiddleages.libsyn.com/rss

	# https://thehistoryofvikings.com/ # Noah Tetzner
	#https://feeds.captivate.fm/thehistoryofvikings/

	# http://podcast.storiesofthesecondworldwar.com/ # Noah Tetzner
	#https://feeds.acast.com/public/shows/stories-of-the-second-world-war

	# https://corecursive.com/
	https://corecursive.libsyn.com/feed

	# https://www.acast.com/historyofthepapacy
	#https://rss.acast.com/historyofthepapacy

	# https://thegreatwarpodcast.podbean.com/ # finished
	#https://feed.podbean.com/thegreatwarpodcast/feed.xml

	# https://barrystrauss.com/podcast/
	#https://antiquitas.castos.com/feed

	# https://adspthepodcast.com/
	https://feeds.buzzsprout.com/1501960.rss

	# https://www.arraycast.com/
	https://www.arraycast.com/episodes?format=rss

	# https://handmade.network/podcast
	#https://handmade.network/podcast/podcast.xml

	# https://wondery.com/shows/tides-of-history/
	https://rss.art19.com/tides-of-history
	#!/usr/bin/env python3

	# Dependencies:
	# apt install python3-lxml python3-requests python3-dateutil
	# pacman -S python-lxml python-requests python-dateutil
	# pip install lxml requests python-dateutil

	import argparse
	import sys, os, os.path, re
	import fnmatch
	from pathlib import Path
	from urllib.parse import urlparse

	import dateutil.parser
	import requests
	from lxml import etree, html

	# Default value of --output-dir: one sub-directory per channel is created here.
	DEFAULT_OUTPUT_DIR = '/mnt/storage/podcasts'
	# List file that is read when no URL is given on the command line.
	DEFAULT_LIST_FILE = os.path.expanduser('~/.podcasts.lst')
	# fnmatch patterns tested against "host + path" of links found on HTML pages.
	RSS_PODCAST_URLS = (
	# Effectively redundant: links are first tested with a regex for the
	# whole words "rss" / "feed(s)", and every pattern below contains one of
	# them, so this tuple is only reached as a fallback.
	'feeds.feedburner.com/*',
	'rss.acast.com/*',
	'feeds.acast.com/public/shows/*',
	'*.libsyn.com/rss',
	'feeds.megaphone.fm/*',
	'anchor.fm/s/*/podcast/rss',
	'feeds.captivate.fm/*',
	)

	# Command line: zero or more feed/page URLs, or "@file" references to
	# list files holding one URL per line.
	argp = argparse.ArgumentParser()
	argp.add_argument('url', nargs='*')
	argp.add_argument('--output-dir', '-o', type=str, default=DEFAULT_OUTPUT_DIR)
	argp.add_argument('--dry-run', '-n', action='store_true')
	argp.add_argument('--verbose', '-v', action='store_true')
	argp.add_argument('--print-full-path', '-P', action='store_true')

	args = argp.parse_args()
	url_args = args.url
	if not url_args:
	    # No URLs given: fall back to the default list file, or show the
	    # usage and exit when that file does not exist either.
	    if not os.path.exists(DEFAULT_LIST_FILE):
	        argp.print_usage()
	        sys.exit(1)
	    url_args = ['@' + DEFAULT_LIST_FILE]
	urls = []
	for arg in url_args:
	    if arg.startswith('@'):
	        # "@file": one URL per line; '#' starts a comment and lines
	        # that are blank after stripping are ignored.
	        with open(arg[1:], 'r') as f:
	            for line in f:
	                line = re.sub('#.*', '', line)
	                line = line.strip()
	                if line:
	                    urls.append(line)
	    else:
	        urls.append(arg)

	def text2filename(text: str):
	    """Turn free-form text into a file-name-safe ASCII token.

	    Every run of characters other than ASCII letters and digits becomes
	    a single underscore; an underscore left at the very start or end of
	    the result is then removed.
	    """
	    name = re.sub('[^0-9A-Za-z]+', '_', text)
	    # The alternation must be a bare '|': an escaped '\|' would match a
	    # literal pipe character and never trim the edge underscores.
	    name = re.sub('^_|_$', '', name)
	    return name

	# Main work loop.
	#
	# `urls` is a queue of pages/feeds still to visit.  An HTML page is only
	# scanned for feed links when the user listed it explicitly (it is in
	# `top_urls`); feeds discovered that way are put at the front of the queue.
	# An RSS document has every not-yet-downloaded MP3 enclosure fetched into
	# <output-dir>/<channel>/<YYYY-MM-DD>-<item title>.mp3.
	top_urls = set(urls)
	added_urls = set(top_urls)  # everything ever queued, to avoid re-queuing
	while urls:
	    rss_url, urls = urls[0], urls[1:]
	    if args.verbose:
	        print('<<', rss_url)
	    try:
	        rss_resp = requests.get(rss_url, timeout=15)
	        rss_resp.raise_for_status()
	    except OSError as e:
	        # requests' exceptions derive from OSError (IOError).
	        print(rss_url, 'failed to fetch rss/html:', repr(e), file=sys.stderr)
	        continue
	    # Servers may omit Content-Type; treat that as an unknown type instead
	    # of crashing.  Parameters such as "; charset=utf-8" are dropped.
	    ctype = rss_resp.headers.get('content-type', '')
	    ctype = re.sub(';.*', '', ctype).strip().lower()
	    if ctype == 'text/html':
	        if rss_url not in top_urls:
	            continue
	        try:
	            tree = html.fromstring(rss_resp.text)
	        except (etree.LxmlError, ValueError) as e:
	            print(rss_url, 'failed to parse html:', repr(e), file=sys.stderr)
	            continue
	        # Feed candidates found on this page, in discovery order.
	        found_urls = []
	        for new_url in tree.xpath('//link[@type="application/rss+xml"]/@href'):
	            if new_url not in added_urls:
	                found_urls.append(new_url)
	                added_urls.add(new_url)
	        for new_url in tree.xpath('//a/@href'):
	            if new_url in added_urls:
	                continue
	            # The alternation must be a bare '|'; an escaped '\|' would
	            # only match the literal text "rss|feed".
	            looks_like_feed = bool(re.search(r'(?i)\b(?:rss|feeds?)\b', new_url))
	            if not looks_like_feed:
	                # Fall back to the known-host patterns, matched against
	                # "host/path" with any ":port" suffix removed.
	                _url = urlparse(new_url)
	                _url = re.sub(':.*', '', _url.netloc) + _url.path
	                looks_like_feed = any(
	                    fnmatch.fnmatch(_url, pattern) for pattern in RSS_PODCAST_URLS)
	            if looks_like_feed:
	                found_urls.append(new_url)
	                added_urls.add(new_url)
	        # Visit the discovered feeds before the rest of the queue.
	        urls = found_urls + urls
	    elif ctype in ('application/rss+xml', 'text/xml', 'application/xml'):
	        try:
	            tree = etree.fromstring(rss_resp.content)
	        except etree.LxmlError as e:
	            # One malformed feed must not abort the remaining ones.
	            print(rss_url, 'failed to parse rss:', repr(e), file=sys.stderr)
	            continue
	        if tree.tag != 'rss':
	            continue
	        for chan in tree.xpath('/rss/channel'):
	            chan_title = chan.xpath('title/text()')
	            if not chan_title:
	                print(rss_url, 'channel has no title', file=sys.stderr)
	                continue
	            chan_title = text2filename(chan_title[0])
	            chan_dir = os.path.join(args.output_dir, chan_title)
	            # Walk the items in reverse document order.
	            for item in chan.xpath('item')[::-1]:
	                item_title = item.xpath('title/text()')
	                if not item_title:
	                    print(rss_url, 'item has no title', file=sys.stderr)
	                    continue
	                item_title = text2filename(item_title[0])
	                item_time = item.xpath('pubDate/text()')
	                if not item_time:
	                    print(rss_url, 'item has no pubDate', file=sys.stderr)
	                    continue
	                try:
	                    item_time = dateutil.parser.parse(item_time[0])
	                except (ValueError, OverflowError):
	                    # item_time is still the xpath result list here.
	                    print(rss_url, 'item has invalid pubDate', item_time[0], file=sys.stderr)
	                    continue
	                item_time = item_time.strftime('%Y-%m-%d')
	                # Only MP3 enclosures are supported; take the first one.
	                enclosure = None
	                for candidate in item.xpath('enclosure[starts-with(@type, "audio/")]'):
	                    if candidate.get('type') == 'audio/mpeg':
	                        enclosure = candidate
	                        ext = '.mp3'
	                        break
	                if enclosure is None:
	                    continue
	                media_url = enclosure.get('url')
	                if not media_url:
	                    print(rss_url, 'item enclosure has no url', file=sys.stderr)
	                    continue
	                media_file = os.path.join(chan_dir, item_time + '-' + item_title + ext)
	                if os.path.exists(media_file):
	                    continue
	                if args.print_full_path:
	                    print(media_file)
	                else:
	                    print(os.path.relpath(media_file, args.output_dir))
	                if args.dry_run:
	                    continue
	                # Download to a ".tmp" sibling and rename on success so that
	                # an interrupted transfer is never mistaken for a finished one.
	                media_file_tmp = media_file + '.tmp'
	                try:
	                    os.makedirs(chan_dir, exist_ok=True)
	                    with requests.get(media_url, stream=True, timeout=15) as media_resp:
	                        media_resp.raise_for_status()
	                        with open(media_file_tmp, 'wb') as f:
	                            for chunk in media_resp.iter_content(chunk_size=None):
	                                f.write(chunk)
	                    os.rename(media_file_tmp, media_file)
	                except OSError as e:
	                    print(rss_url, f'failed to download {media_url}:', repr(e), file=sys.stderr)
	                    if os.path.exists(media_file_tmp):
	                        os.unlink(media_file_tmp)
	                    continue