Last active
September 2, 2021 15:44
-
-
Save woky/9ba28b53556ffbe138fcf9e9082dc17e to your computer and use it in GitHub Desktop.
podcastdl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://thehistoryofrome.typepad.com/ # finished | |
#https://feeds.feedburner.com/TheHistoryOfRome | |
# https://thehistoryofrome.typepad.com/revolutions_podcast/ | |
https://revolutionspodcast.libsyn.com/rss | |
# https://thehistoryofbyzantium.com/ | |
https://rss.acast.com/thehistoryofbyzantium | |
# https://darknetdiaries.com/ | |
https://feeds.megaphone.fm/darknetdiaries | |
# https://hubermanlab.libsyn.com/ | |
https://hubermanlab.libsyn.com/rss | |
# https://routingtable.cloud/ | |
https://anchor.fm/s/1a3cf0b8/podcast/rss | |
# http://www.astronomycast.com/ | |
https://astronomycast.libsyn.com/rss | |
# https://americanbiography.webs.com/ | |
#https://rss.acast.com/americanbiography | |
# http://ethnopolis.co.uk/ | |
#https://historyofyugoslavia.libsyn.com/rss | |
# https://historyofenglishpodcast.com/ | |
#https://historyofenglishpodcast.com/feed/podcast/ | |
# https://therealmiddleages.com/ | |
#https://therealmiddleages.libsyn.com/rss | |
# https://thehistoryofvikings.com/ # Noah Tetzner | |
#https://feeds.captivate.fm/thehistoryofvikings/ | |
# http://podcast.storiesofthesecondworldwar.com/ # Noah Tetzner | |
#https://feeds.acast.com/public/shows/stories-of-the-second-world-war | |
# https://corecursive.com/ | |
https://corecursive.libsyn.com/feed | |
# https://www.acast.com/historyofthepapacy | |
#https://rss.acast.com/historyofthepapacy | |
# https://thegreatwarpodcast.podbean.com/ # finished | |
#https://feed.podbean.com/thegreatwarpodcast/feed.xml | |
# https://barrystrauss.com/podcast/ | |
#https://antiquitas.castos.com/feed | |
# https://adspthepodcast.com/ | |
https://feeds.buzzsprout.com/1501960.rss | |
# https://www.arraycast.com/ | |
https://www.arraycast.com/episodes?format=rss | |
# https://handmade.network/podcast | |
#https://handmade.network/podcast/podcast.xml | |
# https://wondery.com/shows/tides-of-history/ | |
https://rss.art19.com/tides-of-history |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Dependencies: | |
# apt install python3-lxml python3-requests python3-dateutil | |
# pacman -S python-lxml python-requests python-dateutil | |
# pip install lxml requests python-dateutil | |
import argparse
import fnmatch
import os
import os.path
import re
import sys
from pathlib import Path
from urllib.parse import urljoin, urlparse

import dateutil.parser
import requests
from lxml import etree, html
# Directory episodes are written to unless --output-dir overrides it.
DEFAULT_OUTPUT_DIR = '/mnt/storage/podcasts'
# List file read when no URL is passed on the command line.
DEFAULT_LIST_FILE = os.path.expanduser('~/.podcasts.lst')

# fnmatch patterns, matched against "host + path" of a link (no scheme).
# Effectively redundant now: the rss/feed regex used while scanning HTML
# pages already matches every one of them.
RSS_PODCAST_URLS = (
    'feeds.feedburner.com/*',
    'rss.acast.com/*',
    'feeds.acast.com/public/shows/*',
    '*.libsyn.com/rss',
    'feeds.megaphone.fm/*',
    'anchor.fm/s/*/podcast/rss',
    'feeds.captivate.fm/*',
)

argp = argparse.ArgumentParser()
argp.add_argument('url', nargs='*')
argp.add_argument('--output-dir', '-o', type=str, default=DEFAULT_OUTPUT_DIR)
argp.add_argument('--dry-run', '-n', action='store_true')
argp.add_argument('--verbose', '-v', action='store_true')
argp.add_argument('--print-full-path', '-P', action='store_true')
args = argp.parse_args()

# With no URL arguments, fall back to the default list file; if that file
# does not exist either there is nothing to do, so show usage and exit.
if args.url:
    url_args = args.url
elif os.path.exists(DEFAULT_LIST_FILE):
    url_args = ['@' + DEFAULT_LIST_FILE]
else:
    argp.print_usage()
    sys.exit(1)

# Expand the arguments into the work queue. An argument of the form
# "@path" names a list file: one URL per line, "#" starts a comment and
# blank lines are ignored. Anything else is taken as a URL verbatim.
urls = []
for url_arg in url_args:
    if not url_arg.startswith('@'):
        urls.append(url_arg)
        continue
    with open(url_arg[1:], 'r') as list_file:
        entries = (re.sub('#.*', '', raw).strip() for raw in list_file)
        urls.extend(entry for entry in entries if entry)
def text2filename(text: str):
    """Reduce arbitrary text to a name made of ASCII letters, digits and '_'.

    Each run of characters outside ``0-9A-Za-z`` becomes a single
    underscore; an underscore left at the very start or end is then removed.
    The result may be empty when *text* contains no ASCII alphanumerics.
    """
    collapsed = re.sub('[^0-9A-Za-z]+', '_', text)
    return re.sub('^_|_$', '', collapsed)
# URLs named by the user. Only these are scanned for feed links when they
# turn out to be HTML pages; pages discovered by scanning are not scanned
# again.
top_urls = set(urls)
# Every URL queued so far, so that a discovered link is queued only once.
added_urls = set(top_urls)

while urls:
    rss_url = urls.pop(0)
    if args.verbose:
        print('<<', rss_url)
    try:
        rss_resp = requests.get(rss_url, timeout=15)
        rss_resp.raise_for_status()
    except OSError as e:
        print(rss_url, 'failed to fetch rss/html:', repr(e), file=sys.stderr)
        continue

    # The header may be absent; treat that as "unknown type" instead of
    # crashing. Parameters (";charset=...") are dropped and the media type
    # is compared case-insensitively.
    ctype = rss_resp.headers.get('content-type', '')
    ctype = re.sub(';.*', '', ctype).strip().lower()

    if ctype == 'text/html':
        if rss_url not in top_urls:
            continue
        try:
            tree = html.fromstring(rss_resp.text)
        except (etree.LxmlError, ValueError) as e:
            # One unparsable page must not abort the remaining URLs.
            print(rss_url, 'failed to parse html:', repr(e), file=sys.stderr)
            continue

        # Feed candidates found on this page, in document order.
        discovered = []

        # Feeds advertised through <link type="application/rss+xml">.
        for href in tree.xpath('//link[@type="application/rss+xml"]/@href'):
            # Resolve relative links against the final (post-redirect) URL
            # so they can actually be fetched.
            new_url = urljoin(rss_resp.url, href)
            if new_url not in added_urls:
                discovered.append(new_url)
                added_urls.add(new_url)

        # Ordinary anchors whose href looks like a feed. The decision is
        # made on the href exactly as written in the page; only the URL
        # that gets queued is resolved.
        for href in tree.xpath('//a/@href'):
            looks_like_feed = bool(re.search(r'(?i)\b(?:rss|feeds?)\b', href))
            if not looks_like_feed:
                parsed = urlparse(href)
                # Host without port, followed by the path.
                host_and_path = re.sub(':.*', '', parsed.netloc) + parsed.path
                looks_like_feed = any(
                    fnmatch.fnmatch(host_and_path, pattern)
                    for pattern in RSS_PODCAST_URLS
                )
            if not looks_like_feed:
                continue
            new_url = urljoin(rss_resp.url, href)
            if new_url not in added_urls:
                discovered.append(new_url)
                added_urls.add(new_url)

        # Process the discovered feeds next, before the rest of the queue.
        urls[:0] = discovered

    elif ctype in ('application/rss+xml', 'text/xml', 'application/xml'):
        try:
            tree = etree.fromstring(rss_resp.content)
        except etree.LxmlError as e:
            # A malformed feed is reported and skipped.
            print(rss_url, 'failed to parse rss:', repr(e), file=sys.stderr)
            continue
        if tree.tag != 'rss':
            continue

        for chan in tree.xpath('/rss/channel'):
            chan_title = chan.xpath('title/text()')
            if not chan_title:
                print(rss_url, 'channel has no title', file=sys.stderr)
                continue
            chan_title = text2filename(chan_title[0])
            chan_dir = os.path.join(args.output_dir, chan_title)

            # Walk the items in reverse document order.
            for item in reversed(chan.xpath('item')):
                item_title = item.xpath('title/text()')
                if not item_title:
                    print(rss_url, 'item has no title', file=sys.stderr)
                    continue
                item_title = text2filename(item_title[0])

                item_time = item.xpath('pubDate/text()')
                if not item_time:
                    print(rss_url, 'item has no pubDate', file=sys.stderr)
                    continue
                try:
                    item_date = dateutil.parser.parse(item_time[0])
                except (ValueError, OverflowError):
                    print(rss_url, 'item has invalid pubDate', item_time[0], file=sys.stderr)
                    continue
                item_time = item_date.strftime('%Y-%m-%d')

                # Use the first audio enclosure of type audio/mpeg; items
                # without one are skipped.
                enclosure = next(
                    (
                        candidate
                        for candidate in item.xpath('enclosure[starts-with(@type, "audio/")]')
                        if candidate.get('type') == 'audio/mpeg'
                    ),
                    None,
                )
                if enclosure is None:
                    continue
                ext = '.mp3'

                media_url = enclosure.get('url')
                if not media_url:
                    print(rss_url, 'enclosure has no url', file=sys.stderr)
                    continue

                media_file = os.path.join(chan_dir, item_time + '-' + item_title + ext)
                if os.path.exists(media_file):
                    continue
                if args.print_full_path:
                    print(media_file)
                else:
                    print(os.path.relpath(media_file, args.output_dir))
                if args.dry_run:
                    continue

                # Also creates the output directory itself when missing.
                os.makedirs(chan_dir, exist_ok=True)

                # Download to a temporary name and rename on success, so
                # the final file name never refers to a partial download.
                media_file_tmp = media_file + '.tmp'
                try:
                    # The context manager releases the connection even when
                    # the transfer fails half-way.
                    with requests.get(media_url, stream=True, timeout=15) as media_resp:
                        media_resp.raise_for_status()
                        with open(media_file_tmp, 'wb') as f:
                            for chunk in media_resp.iter_content(chunk_size=None):
                                f.write(chunk)
                except OSError as e:
                    print(rss_url, f'failed to download {media_url}:', repr(e), file=sys.stderr)
                    if os.path.exists(media_file_tmp):
                        os.unlink(media_file_tmp)
                    continue
                os.rename(media_file_tmp, media_file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0 4 * * * systemd-cat -t podcastdl ~/podcastdl |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment