Last active
September 18, 2021 13:25
-
-
Save jmcantrell/697949e48aa5f2bbb4d3e5c10e82035a to your computer and use it in GitHub Desktop.
simple podcast downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -e
# script basename, used for config/cache paths and as the syslog tag
me=${0##*/}
# XDG-style state directories for this script
cache=${XDG_CACHE_HOME:-$HOME/.cache}/$me
config=${XDG_CONFIG_HOME:-$HOME/.config}/$me
# default list of subscribed feeds, one "<url>\t[name]" per line
urls=$config/urls
# help text, printed for -h or on a usage error
usage="Update a directory with audio found in RSS/Atom feeds.

Usage: $me [-h] [-d dir] [-p dir] [-q pattern] [source...]

    -d directory  save audio files to directory
    -p directory  save playlists to directory
                  default: same directory as audio files
    -q pattern    only update feeds matching pattern

Where source can be any number of:
    feed url
    xml feed file
    opml subscription file
    text file containing feed urls

If the source is a text file with urls, this format is expected:

    <url>\\t[name]

The file might have lines like:

    http://feeds.wnyc.org/radiolab
    https://onbeing.org/programs/feed/\\tOn Being
    http://www.ttbook.org/book/radio/rss/feed

Examples:

    # Look at all the ways you can import feeds!
    podcatch *.opml *.xml
    podcatch /path/to/urls.txt
    podcatch https://www.wnyc.org/feeds/shows/otm

    # By default, running $me will update all feeds in:
    $urls

    # Format of above is the same as the text file described earlier.
    # Audio files will be stored in the current directory.
    # The directory can be changed with -d or \$PODCASTS.
    # The playlists similarly with -p or \$PLAYLISTS.

    # This alias will update all the configured feeds,
    # putting the audio files in ~/Podcasts/<feed name>/,
    # and updating a playlist at ~/Playlists/<feed name>.m3u.
    # Without -p it would be ~/Podcasts/<feed name>/playlist.m3u.
    alias podcatch='podcatch -d ~/Podcasts -p ~/Playlists'

Environment variables:

    export PODCASTS=/path/to/podcasts
    export PLAYLISTS=/path/to/playlists
"
# decode html entities like: &amp; => & (requires the recode tool)
unhtml='recode -q html..ascii'
# base curl command: fail on http errors and follow redirects
curl="curl --fail --location"
has() {
    # True if every named command is available in PATH.
    type -p "$@" >/dev/null 2>&1
}
unset log
# remember whether syslog is available; calling logger directly in log()
# avoids the word-splitting/quoting bugs of storing a command in a string
has logger && log=1
log() {
    # Log a message to syslog (when available) and always to stderr.
    if [[ $log ]]; then
        logger -t "$me" "$@"
    fi
    echo "$@" >&2
}
temp() {
    # Create a temp file and print its path.
    # NOTE: the previous version set an EXIT trap here, but command
    # substitution runs this function in a subshell, so the trap fired
    # there (not in the caller) and never provided real cleanup.
    local tmp
    tmp=$(mktemp -t "$me.XXXXXXXXXX") || return 1
    echo "$tmp"
}
input() {
    # Prompt the user for a value, offering an optional default.
    # Prints the chosen value; fails if the result is empty.
    local label=${1:-value}
    local fallback=$2
    local answer
    # press enter to accept the default
    read -p "Enter $label${fallback:+ [$fallback]}: " answer
    if [[ ! $answer ]]; then
        answer=$fallback
    fi
    echo "$answer"
    [[ $answer ]]
}
get() {
    # Download a url into a fresh temp file and print the file's path.
    local url=${1:?missing url}
    local file=$(temp)
    log "Getting $url"
    $curl -o "$file" "$url" || {
        log "Unable to get $url"
        return 1
    }
    # should be captured in a var
    echo "$file"
}
head_value() {
    # Print the value of a single response header for a url.
    local url=${1:?missing url}
    local header=${2:?missing header}
    # -i: header names are case-insensitive, and HTTP/2 responses send
    # them all-lowercase, which a case-sensitive match would miss
    $curl --silent --head "$url" |
        grep -i "^$header:" | tail -n1 |   # ensure only one
        cut -d' ' -f2- | cut -d';' -f1 |   # cut out value
        tr -d '\r'                         # strip the CRLF terminator
}
content_type() {
    # Print the Content-Type header value for a url.
    head_value "$1" "Content-Type"
}
last_modified() {
    # Print the Last-Modified header value for a url.
    head_value "$1" "Last-Modified"
}
get_feed_xml() {
    # Print the path to a cached copy of a feed's xml, refreshing the
    # cache when the server's Last-Modified value changes.
    local url=${1:?missing feed url}
    local name=${2:?missing feed name}
    local last=$(last_modified "$url") || return 1
    local xml=$cache/$name/feed.xml
    local lastfile=$cache/$name/last
    # Refresh when there is no cached copy yet, or when the timestamp
    # changed.  (The previous logic only downloaded when $lastfile
    # already existed, so a feed was never fetched on its first run.)
    if [[ ! -f $xml || ! -f $lastfile ]] || ! grep -q -F "$last" "$lastfile"; then
        local temp=$(get "$url") || return 1
        mkdir -p "$(dirname "$xml")"
        mv -f "$temp" "$xml"
    fi
    mkdir -p "$(dirname "$lastfile")"
    echo "$last" >"$lastfile"
    echo "$xml"
}
is_url() {
    # True if the argument looks like an http(s) url.
    # Pure-bash regex instead of the deprecated egrep + herestring.
    [[ $1 =~ ^https?:// ]]
}
is_audio() {
    # True if the argument is an audio/* mime type.
    # Glob match instead of the deprecated egrep + herestring.
    [[ $1 == audio/* ]]
}
xpath() {
    # Evaluate an xpath query against an xml file and print the result
    # with html entities decoded (requires xmlstarlet's "xml" command
    # and the recode tool configured in $unhtml).
    local xml=${1:?missing xml file}
    local query=${2:?missing xpath query}
    xml sel -t -v "$query" -n "$xml" | $unhtml
}
audio_extension() {
    # Print a suitable filename extension for an audio file, chosen
    # from its detected mime type.  Fails for non-audio files.
    local file=${1:?missing audio file}
    local filetype=$(file --mime-type -b "$file")
    if [[ ! $filetype ]]; then
        log "Unable to get type for file $file"
        return 1
    fi
    if ! is_audio "$filetype"; then
        log "The file $file is not audio"
        return 1
    fi
    local ext
    # generalized map: also cover common mp4/flac/wav variants
    case ${filetype##audio/} in
        speex) ext=spx ;;
        ogg | x-ogg) ext=ogg ;;
        mpeg | mp3) ext=mp3 ;;
        mp4 | x-m4a) ext=m4a ;;
        flac | x-flac) ext=flac ;;
        wav | x-wav) ext=wav ;;
        *)
            log "Unrecognized file type $filetype"
            return 1
            ;;
    esac
    echo "$ext"
}
download() {
    # Fetch an episode's audio url into $podcasts/<name>/ and record it
    # in the feed's playlist.  Urls already seen are skipped.
    local url=${1:?missing audio url}
    local name=${2:?missing feed name}
    local title=${3:?missing episode title}
    if ! is_url "$url"; then
        log "Invalid url $url"
        return 1
    fi
    local dir=$podcasts/$name
    local seen=$cache/$name/seen
    # if this url has been encountered before, skip it
    # (-x -F: whole-line literal match; urls contain regex metacharacters,
    # and the old "\b$url\b" regex could false-positive on prefix urls)
    grep -qxF -- "$url" "$seen" &>/dev/null && return 0
    # hash helps avoid collisions
    # easy solution for sane filenames
    local id=$(md5sum <<<"$url" | cut -f1 -d' ')
    # check for existing file (the feed dir may not exist yet)
    local audio=$(find "$dir" -name "$id.*" 2>/dev/null)
    if [[ ! $audio ]]; then
        # indicate that this url has been encountered
        mkdir -p "$(dirname "$seen")"
        echo "$url" >>"$seen"
        # eliminate non-audio early
        is_audio "$(content_type "$url")" || return 0
        local temp=$(get "$url") || return 1
        local ext=$(audio_extension "$temp") || return 1
        audio=$dir/$id.$ext
        mkdir -p "$dir"
        mv -f "$temp" "$audio"
        log "Saved to $audio"
        # try to avoid getting banned:
        # wait half a second between downloads
        sleep 0.5
    fi
    add_to_m3u "$audio" "$name" "$title"
}
add_to_m3u() {
    # Append an audio file to the feed's m3u playlist, at most once.
    local audio=${1:?missing audio file}
    local name=${2:?missing feed name}
    local title=${3:?missing episode title}
    # playlists go with the audio or separate?
    local m3u entry
    if [[ -d $playlists ]]; then
        m3u=$playlists/$name.m3u
        # playlists live apart from audio, so full path
        entry=$audio
    else
        m3u=$podcasts/$name/playlist.m3u
        # if no playlist dir, no need for full path
        entry=${audio##*/}
    fi
    [[ -f $m3u ]] || echo "#EXTM3U" >"$m3u"
    # -F --: the entry is a literal path, not a regex (dots in filenames
    # would otherwise match any character)
    if ! grep -qF -- "$entry" "$m3u"; then
        log "Adding $title to $m3u"
        echo -e "#EXTINF:0,$name: $title\n$entry" >>"$m3u"
    fi
}
update_url() {
    # Download any new enclosures for every item in a feed.
    # $1: feed url, $2: feed name (used for directory/playlist names).
    local url=${1:?missing feed url}
    local name=${2:?missing feed name}
    log "Checking $name"
    local xml=$(get_feed_xml "$url" "$name") || return 1
    local guid title
    # walk items by guid so each item's title/enclosures can be queried
    xpath "$xml" "//item/guid" |
    while read -r guid; do
        title=$(xpath "$xml" "//item[guid='$guid']/title")
        if [[ ! $title ]]; then
            log "Item $guid has no title"
            title=$guid # better than nothing
        fi
        xpath "$xml" "//item[guid='$guid']//enclosure/@url" |
        while read -r url; do
            # no enclosures for item
            [[ $url ]] || continue
            if ! is_url "$url"; then
                log "Invalid enclosure url $url"
                continue
            fi
            if ! download "$url" "$name" "$title"; then
                log "Unable to download $url"
                # NOTE(review): this runs in a pipeline subshell, so the
                # "return" exits the subshell; the failure surfaces via
                # the pipeline's exit status rather than directly.
                return 1
            fi
        done
    done
    return 0
}
import_urls() {
    # Import every feed listed in a text file of "<url>\t[name]" lines.
    # Comment lines (leading #) are skipped.
    local listfile=${1:?missing urls file}
    log "Importing urls file $listfile"
    local url name
    grep -v '^ *#' "$listfile" |
    while read -r url name; do
        import_url "$url" "$name" || return 1
    done
}
import_url() {
    # Add a feed url (with an optional display name) to the urls file,
    # prompting the user to confirm or edit the name.
    local url=${1:?missing feed url}
    local name=$2
    log "Importing $url"
    local temp=$(get "$url") || return 1
    # try to get title if it wasn't explicitly set
    [[ $name ]] || name=$(xpath "$temp" "//channel/title")
    # ask user to make any changes
    name=$(input "feed name" "$name")
    if [[ ! $name ]]; then
        log "Unable to get feed name for $url"
        return 1
    fi
    # Compare against the url column literally: the old "^$url\b" regex
    # misbehaved on regex metacharacters and printed an error when the
    # urls file did not exist yet.
    if [[ -f $urls ]] && cut -f1 "$urls" | grep -qxF -- "$url"; then
        log "Already imported $url"
        return 0
    fi
    mkdir -p "$(dirname "$urls")"
    echo -e "$url\t$name" >>"$urls"
}
import_opml() {
    # Import every feed listed in an opml subscription file.
    local opml=${1:?missing opml file}
    log "Importing opml file $opml"
    local feed label
    xpath "$opml" "//outline/@xmlUrl" |
    while read -r feed; do
        is_url "$feed" || continue
        # the outline's text attribute is the feed's display name
        label=$(xpath "$opml" "//outline[@xmlUrl='$feed']/@text")
        import_url "$feed" "$label" || log "Unable to import $feed"
    done
}
# where audio files go (override with $PODCASTS or -d)
podcasts=${PODCASTS:-$PWD}
# where playlists go (override with $PLAYLISTS or -p); empty means each
# playlist is stored next to its feed's audio files.
# (fixes typo: previously read $PLAYlISTS, so the env var never worked)
playlists=$PLAYLISTS
# parse command line options
unset query
unset OPTIND
while getopts ":hd:p:q:" opt; do
    case $opt in
        d) podcasts=$OPTARG ;;
        p) playlists=$OPTARG ;;
        q) query=$OPTARG ;;
        h)
            echo "$usage" >&2
            exit 0
            ;;
        *)
            echo "$usage" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))
if (( $# > 0 )); then
    # arguments given: import each one based on what it is
    for arg in "$@"; do
        if [[ -f $arg ]]; then
            filetype=$(file --mime-type -bL "$arg")
            case $filetype in
                text/*ml) import=import_opml ;;
                text/plain) import=import_urls ;;
                *)
                    # continue: previously a stale (or unset) $import
                    # from an earlier iteration was run on this argument
                    log "Unusable type $filetype for file $arg"
                    continue
                    ;;
            esac
        else
            case "$arg" in
                http*) import=import_url ;;
                *)
                    log "Unusable argument $arg"
                    continue
                    ;;
            esac
        fi
        $import "$arg" || log "Unable to import $arg"
    done
else
    # no arguments: update every configured feed
    grep -v '^ *#' "$urls" |
    while read -r url name; do
        # -q restricts the update to feeds matching the pattern
        if [[ $query ]]; then
            grep -q "$query" <<<"$url $name" || continue
        fi
        update_url "$url" "$name" || log "Unable to update $url"
    done
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment