Last active
September 18, 2021 13:25
-
-
Save jmcantrell/697949e48aa5f2bbb4d3e5c10e82035a to your computer and use it in GitHub Desktop.
simple podcast downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -e
# script basename, used for config/cache paths and as the syslog tag
me=${0##*/}
# XDG-style state directories for this script
cache=${XDG_CACHE_HOME:-$HOME/.cache}/$me
config=${XDG_CONFIG_HOME:-$HOME/.config}/$me
# default list of subscribed feeds, one "<url>\t[name]" per line
urls=$config/urls
# help text, printed for -h or on a usage error
usage="Update a directory with audio found in RSS/Atom feeds.

Usage: $me [-h] [-d dir] [-p dir] [-q pattern] [source...]

    -d directory  save audio files to directory
    -p directory  save playlists to directory
                  default: same directory as audio files
    -q pattern    only update feeds matching pattern

Where source can be any number of:
    feed url
    xml feed file
    opml subscription file
    text file containing feed urls

If the source is a text file with urls, this format is expected:

    <url>\\t[name]

The file might have lines like:

    http://feeds.wnyc.org/radiolab
    https://onbeing.org/programs/feed/\\tOn Being
    http://www.ttbook.org/book/radio/rss/feed

Examples:

    # Look at all the ways you can import feeds!
    podcatch *.opml *.xml
    podcatch /path/to/urls.txt
    podcatch https://www.wnyc.org/feeds/shows/otm

    # By default, running $me will update all feeds in:
    $urls

    # Format of above is the same as the text file described earlier.
    # Audio files will be stored in the current directory.
    # The directory can be changed with -d or \$PODCASTS.
    # The playlists similarly with -p or \$PLAYLISTS.

    # This alias will update all the configured feeds,
    # putting the audio files in ~/Podcasts/<feed name>/,
    # and updating a playlist at ~/Playlists/<feed name>.m3u.
    # Without -p it would be ~/Podcasts/<feed name>/playlist.m3u.
    alias podcatch='podcatch -d ~/Podcasts -p ~/Playlists'

Environment variables:

    export PODCASTS=/path/to/podcasts
    export PLAYLISTS=/path/to/playlists
"
# decode html entities like: &amp; => & (requires the recode tool)
unhtml='recode -q html..ascii'
# base curl command: fail on http errors and follow redirects
curl="curl --fail --location"
has() {
    # True if every named command is available in PATH.
    type -p "$@" >/dev/null 2>&1
}
unset log
# remember whether syslog is available; calling logger directly in log()
# avoids the word-splitting/quoting bugs of storing a command in a string
has logger && log=1
log() {
    # Log a message to syslog (when available) and always to stderr.
    if [[ $log ]]; then
        logger -t "$me" "$@"
    fi
    echo "$@" >&2
}
temp() {
    # Create a temp file and print its path.
    # NOTE: the previous version set an EXIT trap here, but command
    # substitution runs this function in a subshell, so the trap fired
    # there (not in the caller) and never provided real cleanup.
    local tmp
    tmp=$(mktemp -t "$me.XXXXXXXXXX") || return 1
    echo "$tmp"
}
input() {
    # Prompt the user for a value, offering an optional default.
    # Prints the chosen value; fails if the result is empty.
    local label=${1:-value}
    local fallback=$2
    local answer
    # press enter to accept the default
    read -p "Enter $label${fallback:+ [$fallback]}: " answer
    if [[ ! $answer ]]; then
        answer=$fallback
    fi
    echo "$answer"
    [[ $answer ]]
}
get() {
    # Download a url into a fresh temp file and print the file's path.
    local url=${1:?missing url}
    local file=$(temp)
    log "Getting $url"
    $curl -o "$file" "$url" || {
        log "Unable to get $url"
        return 1
    }
    # should be captured in a var
    echo "$file"
}
head_value() {
    # Print the value of a single response header for a url.
    local url=${1:?missing url}
    local header=${2:?missing header}
    # -i: header names are case-insensitive, and HTTP/2 responses send
    # them all-lowercase, which a case-sensitive match would miss
    $curl --silent --head "$url" |
        grep -i "^$header:" | tail -n1 |   # ensure only one
        cut -d' ' -f2- | cut -d';' -f1 |   # cut out value
        tr -d '\r'                         # strip the CRLF terminator
}
content_type() {
    # Print the Content-Type header value for a url.
    head_value "$1" "Content-Type"
}
last_modified() {
    # Print the Last-Modified header value for a url.
    head_value "$1" "Last-Modified"
}
get_feed_xml() {
    # Print the path to a cached copy of a feed's xml, refreshing the
    # cache when the server's Last-Modified value changes.
    local url=${1:?missing feed url}
    local name=${2:?missing feed name}
    local last=$(last_modified "$url") || return 1
    local xml=$cache/$name/feed.xml
    local lastfile=$cache/$name/last
    # Refresh when there is no cached copy yet, or when the timestamp
    # changed.  (The previous logic only downloaded when $lastfile
    # already existed, so a feed was never fetched on its first run.)
    if [[ ! -f $xml || ! -f $lastfile ]] || ! grep -q -F "$last" "$lastfile"; then
        local temp=$(get "$url") || return 1
        mkdir -p "$(dirname "$xml")"
        mv -f "$temp" "$xml"
    fi
    mkdir -p "$(dirname "$lastfile")"
    echo "$last" >"$lastfile"
    echo "$xml"
}
is_url() {
    # True if the argument looks like an http(s) url.
    # Pure-bash regex instead of the deprecated egrep + herestring.
    [[ $1 =~ ^https?:// ]]
}
is_audio() {
    # True if the argument is an audio/* mime type.
    # Glob match instead of the deprecated egrep + herestring.
    [[ $1 == audio/* ]]
}
xpath() {
    # Evaluate an xpath query against an xml file and print the result
    # with html entities decoded (requires xmlstarlet's "xml" command
    # and the recode tool configured in $unhtml).
    local xml=${1:?missing xml file}
    local query=${2:?missing xpath query}
    xml sel -t -v "$query" -n "$xml" | $unhtml
}
audio_extension() {
    # Print a suitable filename extension for an audio file, chosen
    # from its detected mime type.  Fails for non-audio files.
    local file=${1:?missing audio file}
    local filetype=$(file --mime-type -b "$file")
    if [[ ! $filetype ]]; then
        log "Unable to get type for file $file"
        return 1
    fi
    if ! is_audio "$filetype"; then
        log "The file $file is not audio"
        return 1
    fi
    local ext
    # generalized map: also cover common mp4/flac/wav variants
    case ${filetype##audio/} in
        speex) ext=spx ;;
        ogg | x-ogg) ext=ogg ;;
        mpeg | mp3) ext=mp3 ;;
        mp4 | x-m4a) ext=m4a ;;
        flac | x-flac) ext=flac ;;
        wav | x-wav) ext=wav ;;
        *)
            log "Unrecognized file type $filetype"
            return 1
            ;;
    esac
    echo "$ext"
}
download() {
    # Fetch an episode's audio url into $podcasts/<name>/ and record it
    # in the feed's playlist.  Urls already seen are skipped.
    local url=${1:?missing audio url}
    local name=${2:?missing feed name}
    local title=${3:?missing episode title}
    if ! is_url "$url"; then
        log "Invalid url $url"
        return 1
    fi
    local dir=$podcasts/$name
    local seen=$cache/$name/seen
    # if this url has been encountered before, skip it
    # (-x -F: whole-line literal match; urls contain regex metacharacters,
    # and the old "\b$url\b" regex could false-positive on prefix urls)
    grep -qxF -- "$url" "$seen" &>/dev/null && return 0
    # hash helps avoid collisions
    # easy solution for sane filenames
    local id=$(md5sum <<<"$url" | cut -f1 -d' ')
    # check for existing file (the feed dir may not exist yet)
    local audio=$(find "$dir" -name "$id.*" 2>/dev/null)
    if [[ ! $audio ]]; then
        # indicate that this url has been encountered
        mkdir -p "$(dirname "$seen")"
        echo "$url" >>"$seen"
        # eliminate non-audio early
        is_audio "$(content_type "$url")" || return 0
        local temp=$(get "$url") || return 1
        local ext=$(audio_extension "$temp") || return 1
        audio=$dir/$id.$ext
        mkdir -p "$dir"
        mv -f "$temp" "$audio"
        log "Saved to $audio"
        # try to avoid getting banned:
        # wait half a second between downloads
        sleep 0.5
    fi
    add_to_m3u "$audio" "$name" "$title"
}
add_to_m3u() {
    # Append an audio file to the feed's m3u playlist, at most once.
    local audio=${1:?missing audio file}
    local name=${2:?missing feed name}
    local title=${3:?missing episode title}
    # playlists go with the audio or separate?
    local m3u entry
    if [[ -d $playlists ]]; then
        m3u=$playlists/$name.m3u
        # playlists live apart from audio, so full path
        entry=$audio
    else
        m3u=$podcasts/$name/playlist.m3u
        # if no playlist dir, no need for full path
        entry=${audio##*/}
    fi
    [[ -f $m3u ]] || echo "#EXTM3U" >"$m3u"
    # -F --: the entry is a literal path, not a regex (dots in filenames
    # would otherwise match any character)
    if ! grep -qF -- "$entry" "$m3u"; then
        log "Adding $title to $m3u"
        echo -e "#EXTINF:0,$name: $title\n$entry" >>"$m3u"
    fi
}
update_url() {
    # Download any new enclosures for every item in a feed.
    # $1: feed url, $2: feed name (used for directory/playlist names).
    local url=${1:?missing feed url}
    local name=${2:?missing feed name}
    log "Checking $name"
    local xml=$(get_feed_xml "$url" "$name") || return 1
    local guid title
    # walk items by guid so each item's title/enclosures can be queried
    xpath "$xml" "//item/guid" |
    while read -r guid; do
        title=$(xpath "$xml" "//item[guid='$guid']/title")
        if [[ ! $title ]]; then
            log "Item $guid has no title"
            title=$guid # better than nothing
        fi
        xpath "$xml" "//item[guid='$guid']//enclosure/@url" |
        while read -r url; do
            # no enclosures for item
            [[ $url ]] || continue
            if ! is_url "$url"; then
                log "Invalid enclosure url $url"
                continue
            fi
            if ! download "$url" "$name" "$title"; then
                log "Unable to download $url"
                # NOTE(review): this runs in a pipeline subshell, so the
                # "return" exits the subshell; the failure surfaces via
                # the pipeline's exit status rather than directly.
                return 1
            fi
        done
    done
    return 0
}
import_urls() {
    # Import every feed listed in a text file of "<url>\t[name]" lines.
    # Comment lines (leading #) are skipped.
    local listfile=${1:?missing urls file}
    log "Importing urls file $listfile"
    local url name
    grep -v '^ *#' "$listfile" |
    while read -r url name; do
        import_url "$url" "$name" || return 1
    done
}
import_url() {
    # Add a feed url (with an optional display name) to the urls file,
    # prompting the user to confirm or edit the name.
    local url=${1:?missing feed url}
    local name=$2
    log "Importing $url"
    local temp=$(get "$url") || return 1
    # try to get title if it wasn't explicitly set
    [[ $name ]] || name=$(xpath "$temp" "//channel/title")
    # ask user to make any changes
    name=$(input "feed name" "$name")
    if [[ ! $name ]]; then
        log "Unable to get feed name for $url"
        return 1
    fi
    # Compare against the url column literally: the old "^$url\b" regex
    # misbehaved on regex metacharacters and printed an error when the
    # urls file did not exist yet.
    if [[ -f $urls ]] && cut -f1 "$urls" | grep -qxF -- "$url"; then
        log "Already imported $url"
        return 0
    fi
    mkdir -p "$(dirname "$urls")"
    echo -e "$url\t$name" >>"$urls"
}
import_opml() {
    # Import every feed listed in an opml subscription file.
    local opml=${1:?missing opml file}
    log "Importing opml file $opml"
    local feed label
    xpath "$opml" "//outline/@xmlUrl" |
    while read -r feed; do
        is_url "$feed" || continue
        # the outline's text attribute is the feed's display name
        label=$(xpath "$opml" "//outline[@xmlUrl='$feed']/@text")
        import_url "$feed" "$label" || log "Unable to import $feed"
    done
}
# where audio files go (override with $PODCASTS or -d)
podcasts=${PODCASTS:-$PWD}
# where playlists go (override with $PLAYLISTS or -p); empty means each
# playlist is stored next to its feed's audio files.
# (fixes typo: previously read $PLAYlISTS, so the env var never worked)
playlists=$PLAYLISTS
# parse command line options
unset query
unset OPTIND
while getopts ":hd:p:q:" opt; do
    case $opt in
        d) podcasts=$OPTARG ;;
        p) playlists=$OPTARG ;;
        q) query=$OPTARG ;;
        h)
            echo "$usage" >&2
            exit 0
            ;;
        *)
            echo "$usage" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))
if (( $# > 0 )); then
    # arguments given: import each one based on what it is
    for arg in "$@"; do
        if [[ -f $arg ]]; then
            filetype=$(file --mime-type -bL "$arg")
            case $filetype in
                text/*ml) import=import_opml ;;
                text/plain) import=import_urls ;;
                *)
                    # continue: previously a stale (or unset) $import
                    # from an earlier iteration was run on this argument
                    log "Unusable type $filetype for file $arg"
                    continue
                    ;;
            esac
        else
            case "$arg" in
                http*) import=import_url ;;
                *)
                    log "Unusable argument $arg"
                    continue
                    ;;
            esac
        fi
        $import "$arg" || log "Unable to import $arg"
    done
else
    # no arguments: update every configured feed
    grep -v '^ *#' "$urls" |
    while read -r url name; do
        # -q restricts the update to feeds matching the pattern
        if [[ $query ]]; then
            grep -q "$query" <<<"$url $name" || continue
        fi
        update_url "$url" "$name" || log "Unable to update $url"
    done
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment