Created
October 13, 2013 17:37
-
-
Save marek-saji/6965017 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# (c)2012, Marek `saji` Augustynowicz
# Licensed under MIT License, http://marek-saji.mit-license.org
#
# Pass URL(s) as script's parameter(s). If no parameters passed,
# script seeks for `urls` file in current directory and read input
# from there. If no `urls` is to be found, read from stdin.
#
# Script tries to interpret lines as URLs and intelligently downloads them
# (using youtube-dl(1), where it makes sense). If input line does not look like
# a URL (does not contain protocol), line will be evaluated.
# Lines containing URLs are removed after successful download.

# how many times youtube-dl is retried before a URL is declared broken
YOUTUBEDL_RETRIES=3
# script name with any ".sh" suffix stripped; used in screen(1) window titles
SCRIPTNAME="$( basename "$0" .sh )"
##
# printf an error message to stderr (red "ERROR" prefix)
#
# Params:
# - same as printf(1): $1 is the format string, the rest are its arguments
errorf ()
{
    # local: do not leak FMT into the caller's scope
    local FMT="$1"
    shift
    # caller-supplied format is expanded into printf's format string —
    # that is the documented interface; callers must pass trusted formats
    printf "\e[31mERROR\e[m $FMT\n" "$@" 1>&2
}
##
# printf a success message to stdout (green "SUCCESS" prefix)
#
# Params:
# - same as printf(1): $1 is the format string, the rest are its arguments
successf ()
{
    # local: do not leak FMT into the caller's scope
    local FMT="$1"
    shift
    printf "\e[32mSUCCESS\e[m $FMT\n" "$@"
}
##
# printf an info message to stdout (yellow "INFO" prefix)
#
# Params:
# - same as printf(1): $1 is the format string, the rest are its arguments
infof ()
{
    # local: do not leak FMT into the caller's scope
    local FMT="$1"
    shift
    printf "\e[33mINFO\e[m $FMT\n" "$@"
}
##
# printf a header line (black-on-cyan, starred) to stdout
#
# Params:
# - same as printf(1): $1 is the format string, the rest are its arguments
headerf ()
{
    # local: do not leak FMT into the caller's scope
    local FMT="$1"
    shift
    printf "\e[30m\e[46m★ $FMT\e[m\n" "$@"
}
# call the script recursively for each argument when more than one is given;
# a failure of one argument does not stop the others — the script exits with
# the last non-zero status seen
if [ "$#" -gt "1" ]
then
    STATUS=0
    for FILE in "$@"
    do
        "$0" "$FILE"
        THIS_STATUS=$?
        if [ "$THIS_STATUS" -ne "0" ]
        then
            STATUS="$THIS_STATUS"
            #break
        fi
    done
    exit $STATUS
fi
##
# Set screen(1) window title to "$SCRIPTNAME <extra> (<cwd>)"
#
# Params
# - $1: Optional additional title
#
# Uses:
# - $TERM (only acts inside GNU screen), $SCRIPTNAME, current directory
settitle ()
{
    # local: the original leaked TITLE globally, clobbering callers' TITLE
    local TITLE
    # set screen(1) window name; "=" instead of the bashism "==" in [ ]
    if [ "$TERM" = "screen" ]
    then
        TITLE="($( dirs +0 ))"
        if [ -n "$1" ]
        then
            TITLE="$1 $TITLE"
        fi
        # \ek ... \e\\ is screen's window-naming escape; printf instead of
        # the non-portable `echo -ne`
        printf '\ek%s\e\\' "$SCRIPTNAME $TITLE"
    fi
}
##
# Main function
#
# Params:
# - $1: URL to download (one queue line; non-URL lines are eval'd instead)
#
# Uses:
# - $FILE path to file, where URL came from, optional
# - $BROKEN_URLS to store incorrect URLs
# - $CURL_HEAD_OPTIONS, $CURL_OPTIONS, $YOUTUBEDL_OPTIONS,
#   $WGET_RECURSIVE_OPTIONS — optional extra flags, presumably set via
#   .getshrc (TODO confirm; they are referenced but never set here)
get ()
{
    LINE="$1"
    headerf "%s" "$LINE"
    settitle
    # ignore empty lines and comments (lines starting with "#")
    if [ -z "$LINE" ] || [ "#" = "${LINE:0:1}" ]
    then
        return
    fi
    # evaluate lines that do not look like url (start with a protocol);
    # NOTE(review): this executes arbitrary shell code from the queue file
    # — by design per the file header, but the queue must be trusted input
    if ( echo "$LINE" | grep -vPq "^[a-z]+://" )
    then
        infof "Evaluated."
        eval "${LINE}"
        return
    fi
    # check whether there is enough free space (at least 1G); takes df's
    # 4th column (available space, 1K blocks) — assumes GNU coreutils df
    # output layout, TODO confirm on other platforms
    DF=$( /bin/df . | tail -n1 | awk -F\  '{ print $4; }' )
    DF_LIMIT=$(( 1 * 1024 * 1024 ))
    if [ $DF -lt $DF_LIMIT ]
    then
        errorf "Less than 1G free space."
        df -h .
        exit 3
    fi
    # handle google's malware-detecting url: fetch the interstitial page
    # and take the last URL-looking string on it as the real destination
    if ( echo "$LINE" | grep -Pq '^ *(https?://)?(www\.)?google.[a-z.]+/url\?' )
    then
        URL="$( curl --silent "$LINE" | grep -oP "[a-z]+://[^'\"]+" | tail -n1 )"
        infof "Resolved as \`%s'" "$URL"
    else
        URL="$LINE"
    fi
    # rtmp stream: curl cannot speak RTMP, delegate to rtmpdump(1)
    if ( echo "$LINE" | grep -Pq '^ *rtmp://' )
    then
        FILENAME="$( basename "$LINE" )"
        infof "It's a RTMP stream. Will download to “%s”." "$FILENAME"
        rtmpdump -r "$LINE" > "$FILENAME"
        return
    fi
    # probe the URL: headers only, following redirects
    HEADERS="$( curl --head --silent --location $CURL_HEAD_OPTIONS "$URL" )"
    # non-200s: report, remember the URL as broken and skip it
    if ! ( echo "$HEADERS" | grep -q '^HTTP.*200' )
    then
        if ( echo "$HEADERS" | grep -q '^HTTP' )
        then
            errorf "Got %s" "$( echo "$HEADERS" | grep '^HTTP' )"
        else
            errorf "Unable to connect?"
        fi
        BROKEN_URLS="$BROKEN_URLS\n$URL"
        return
    fi
    # MIME type from the last Content-Type header (redirect chains produce
    # several), with any "; charset=..." suffix and whitespace stripped
    MIME="$( echo "$HEADERS" | grep '^Content-Type' | tail -n1 | cut -d\  -f2 | cut -d\; -f1 | sed -re 's/^\s+|\s+$//g' )"
    if [ "$MIME" = "text/html" ]
    then
        # an HTML page: first ask youtube-dl whether it is a known video site
        TITLE="$( youtube-dl --get-title --max-download=1 "$URL" 2>/dev/null )"
        # NOTE(review): unquoted echo of $TITLE looks like leftover debug
        # output — confirm before removing
        echo $TITLE
        if [ -n "$TITLE" ]
        then
            infof "It's a “%s” video" "$TITLE"
            settitle "$TITLE"
            # retry the download up to $YOUTUBEDL_RETRIES times
            for STRIKE in $( seq $YOUTUBEDL_RETRIES )
            do
                if ( youtube-dl --continue --console-title --restrict-filenames --output "%(uploader)s - %(title)s %(id)s.%(ext)s" $YOUTUBEDL_OPTIONS "$URL" )
                then
                    completed "$LINE" "$FILE"
                    break
                elif [ $STRIKE -eq $YOUTUBEDL_RETRIES ]
                then
                    infof "Strike %d, you are out" $STRIKE
                    errorf "Unable to download the video"
                    BROKEN_URLS="$BROKEN_URLS\n$URL"
                else
                    infof "Strike %d, retrying…" $STRIKE
                fi
            done
        else
            # ordinary web page: mirror it with wget into a directory named
            # after the URL with every "/" replaced by a space
            DIR="$( echo "$URL" | sed -e 's/\// /g' )"
            infof "It's “%s”. Will download to “%s” directory." "$MIME" "$DIR"
            mkdir "$DIR"
            cd "$DIR"
            if ( wget --continue --convert-links --no-check-certificate --page-requisites $WGET_RECURSIVE_OPTIONS "$URL" )
            then
                completed "$LINE" "$FILE"
            fi
            # cd - prints the directory it returns to; harmless here
            cd -
        fi
    else
        # anything that is not text/html is treated as a plain file download
        if [ -n "$MIME" ]
        then
            infof "It's “%s”. Will download the file." "$MIME"
            settitle "$( basename "$LINE" )"
        else
            errorf "Got empty MIME type. ☹ Will try to download the file anyway."
        fi
        # -C -: resume a partial download; --remote-header-name honours the
        # server-supplied filename
        if ( curl --location --remote-name --remote-header-name -C - $CURL_OPTIONS "$URL" )
        then
            completed "$LINE" "$FILE"
        elif [ -z "$MIME" ]
        then
            BROKEN_URLS="$BROKEN_URLS\n$URL"
        fi
    fi
    printf "\n"
}
##
# Called, when url is fully retrieved: report success and remove the URL's
# line from the queue file, keeping a one-step backup in "$FILE~"
#
# Params:
# - $1: URL of a file
# - $2: path to a file URL is taken from, optional
completed ()
{
    successf "DONE"
    URL="$1"
    FILE="$2"
    if [ -n "$FILE" ]
    then
        TMP="$( mktemp )"
        # -F: match the URL as a literal string — URLs are full of regex
        # metacharacters (".", "?", "+"), so a regex match could remove the
        # wrong lines; "--" protects URLs that begin with a dash
        grep -vF -- "$URL" "$FILE" > "$TMP"
        OLD_WC=$( wc -l "$FILE" | cut -d\  -f1 )
        EXPECTED_WC=$(( OLD_WC - 1 ))
        NEW_WC=$( wc -l "$TMP" | cut -d\  -f1 )
        # sanity check: exactly one line should have been removed
        if [ "$EXPECTED_WC" != "$NEW_WC" ]
        then
            errorf "Something happened while removing url from queue file"
        fi
        # back up the queue file before replacing it
        cp "$FILE" "$FILE~"
        mv "$TMP" "$FILE"
    fi
}
# include configuration file from the current directory, if present
# (presumably sets CURL_OPTIONS / CURL_HEAD_OPTIONS / YOUTUBEDL_OPTIONS /
# WGET_RECURSIVE_OPTIONS used later — TODO confirm the expected contents)
if [ -f ".getshrc" ]
then
    . .getshrc
fi
# determine URLs source:
# - an argument that looks like a URL is downloaded directly,
# - a directory argument means "use <dir>/urls",
# - any other argument is taken as the queue file itself,
# - with no argument: ./urls if it exists, otherwise stdin
if [ -n "$1" ]
then
    if ( echo "$1" | grep -qP '^[a-z]+://' )
    then
        get "$1"
        exit $?
    else
        if [ -d "$1" ]
        then
            cd "$1"
            FILE="./urls"
        else
            cd "$( dirname "$1" )"
            FILE="$( basename "$1" )"
        fi
        # re-read the config relative to the queue file's directory
        if [ -f .getshrc ]
        then
            . .getshrc
        fi
    fi
elif [ -f "./urls" ]
then
    FILE="./urls"
else
    FILE=/dev/stdin
    infof "Reading from standard input.\n"
fi
# main loop
BROKEN_URLS=""
# NOTE(review): when FILE=/dev/stdin this wc consumes the input before the
# loop — pre-existing behavior, kept as-is
infof "Reading from \`%s', %d lines" "$FILE" "$( wc -l "$FILE" | cut -d\  -f1 )"
# read the queue in the current shell: the previous `cat "$FILE" | while`
# ran the loop in a subshell, so every BROKEN_URLS update made inside get()
# was lost and the final broken-URLs report below could never fire;
# IFS= read -r also keeps backslashes and whitespace in URLs intact
while IFS= read -r URL
do
    get "$URL" "$FILE"
    infof "%s files left in the queue" "$( wc -l "$FILE" )"
done < "$FILE"
if [ -n "$BROKEN_URLS" ]
then
    errorf "\nFOUND BROKEN URLs:$BROKEN_URLS\n"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment