Skip to content

Instantly share code, notes, and snippets.

@trscavo
Last active January 6, 2020 22:54
Show Gist options
  • Save trscavo/5fb0ce26796da9321e84 to your computer and use it in GitHub Desktop.
Save trscavo/5fb0ce26796da9321e84 to your computer and use it in GitHub Desktop.
Shell script to retrieve a web resource via HTTP Conditional GET
#!/bin/bash
#####################################################################
# Retrieve a web resource via HTTP Conditional GET [RFC 7232] and
# output the resource on stdout. Cache the web resource and consult
# the cache on subsequent requests for the same resource. By default,
# return the cached resource (if it exists) if and only if the web
# server responds with 304 Not Modified.
#
# Usage: cget.sh [-vfcH] URL
#
# The default behavior of the script may be modified by using one of
# the following mutually exclusive options:
#
# -f Enables "Force GET Mode"
# -c Enables "Cache Only Mode"
#
# Force GET Mode disables HTTP Conditional GET by excluding the
# relevant request headers (If-Modified-Since and If-None-Match).
# Cache Only Mode bypasses the GET request altogether and goes
# directly to cache. If the resource resides in cache, it is output
# on stdout, otherwise an error is thrown.
#
# This script issues GET requests exclusively. However, the -H option
# causes the response headers to be output on stdout, as though a HEAD
# request were issued. The -H option may be used by itself or in
# conjunction with one of the mode options (-f or -c), in which case
# the response headers are obtained either from the server or
# from cache, as the case may be.
#
# Examples:
# cget.sh URL # Retrieve the resource using HTTP conditional GET
# cget.sh -H URL # Output the response headers (not the resource)
# cget.sh -f URL # Enable Force GET Mode
# cget.sh -c URL # Enable Cache Only Mode
#
#####################################################################
# Version of this script; it is embedded in the User-Agent string below.
script_version="0.1"
# Value sent to the server as the User-Agent request header.
user_agent_string="cget ${script_version}"
# Name the script was invoked as, with any leading directory components
# stripped (same result as basename $0); used as a prefix in messages.
script_name="${0##*/}"
#######################################################################
#
# process command-line options and arguments
#
#######################################################################
# Option flags. Each flag holds the word "true" or "false", which is later
# executed directly as a command wherever the flag is tested.
verbose_mode=false
output_headers=false
force_get_mode=false
cache_only_mode=false

# Parse the options. The leading ":" in the option string selects silent
# error reporting: an unknown option is delivered as "?" with the offending
# letter in OPTARG, and the script prints its own message.
while getopts ":vHfc" opt; do
  case "$opt" in
    v)
      verbose_mode=true
      ;;
    H)
      output_headers=true
      ;;
    f)
      # -f and -c are mutually exclusive: whichever comes last wins
      force_get_mode=true
      cache_only_mode=false
      ;;
    c)
      cache_only_mode=true
      force_get_mode=false
      ;;
    \?)
      echo "ERROR: $script_name: Unrecognized option: -$OPTARG" >&2
      exit 2
      ;;
  esac
done

# determine the location of the web resource: after the options are
# shifted away, exactly one argument (the URL) must remain
shift $(( OPTIND - 1 ))
if [ $# -ne 1 ]; then
  echo "ERROR: $script_name: wrong number of arguments: $# (1 required)" >&2
  exit 2
fi
location="$1"

# The script name is passed as an argument rather than being embedded in
# the format string, so a "%" or "\" in the name is printed literally.
$verbose_mode && printf "%s using location URL: %s\n" "$script_name" "$location"
#######################################################################
#
# determine the cache files (which may or may not exist at this point)
#
#######################################################################
# Compute the cache key: the hex MD5 digest of the location URL. The
# digest is taken from the second space-separated field of the openssl
# output.
hash=$(
  # pipefail is enabled inside this command substitution only, so the rest
  # of the script is unaffected. Without it the pipeline status is that of
  # cut, which succeeds even when openssl is missing or fails.
  set -o pipefail
  printf '%s' "${location}" | /usr/bin/openssl dgst -md5 -hex | cut -d' ' -f2
)
exit_status=$?
# An empty hash is treated as a failure as well: with an empty key every
# URL would map to the same pair of cache files.
if [ $exit_status -ne 0 ] || [ -z "$hash" ]; then
  echo "ERROR: ${script_name} failed to hash the location URL" >&2
  # make sure a failure never exits with status 0
  [ $exit_status -ne 0 ] || exit_status=1
  exit $exit_status
fi
# Directory that holds the cached headers and content of every resource.
web_cache_dir=/tmp/cache

# mkdir -p succeeds when the directory already exists, so there is no gap
# between an existence test and the creation in which a concurrent
# invocation could create the directory and make this one fail. It still
# fails (and is reported) when the path exists but is not a directory.
mkdir -p "$web_cache_dir"
exit_status=$?
if [ $exit_status -ne 0 ]; then
  echo "ERROR: ${script_name} failed to create dir: $web_cache_dir" >&2
  exit $exit_status
fi

# Each resource is cached as two files named after the hash of its URL:
# one for the response headers and one for the response body.
cached_header_file="${web_cache_dir}/${hash}_headers"
cached_content_file="${web_cache_dir}/${hash}_content"
if $verbose_mode; then
  echo "${script_name} using cached header file: ${cached_header_file}"
  echo "${script_name} using cached content file: ${cached_content_file}"
fi
# A cache entry is usable only when both of its files are present.
cache_entry_complete=false
if [ -f "$cached_header_file" ] && [ -f "$cached_content_file" ]; then
  cache_entry_complete=true
fi

# ensure cache integrity: discard whatever is left of a partial entry
if ! $cache_entry_complete; then
  /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
fi

# Cache Only Mode ends here, with or without a cached resource; all other
# modes fall through to the GET request.
if $cache_only_mode; then
  $verbose_mode && echo "${script_name} complete."
  if ! $cache_entry_complete; then
    echo "ERROR: ${script_name} failed to find cached resource: $location" >&2
    exit 1
  fi
  # -H selects the cached headers instead of the cached content
  if ! $output_headers; then
    /bin/cat "${cached_content_file}"
  else
    /bin/cat "${cached_header_file}"
  fi
  exit 0
fi
#######################################################################
#
# initialization
#
#######################################################################
# create a temporary directory
# mktemp -d without a template is tried first; if that fails, the form
# that takes a name prefix via -t is used instead.
tmp_dir=$( mktemp -d 2>/dev/null || mktemp -d -t "${script_name%.*}" )
if [ ! -d "$tmp_dir" ] ; then
  printf "ERROR: Unable to create temporary dir\n" >&2
  exit 1
fi

# Remove the temporary directory when the script exits, on the success
# path and on every error path alike, so that runs do not leave a
# directory behind each time.
trap '/bin/rm -rf -- "$tmp_dir"' EXIT

# The script name is passed as an argument rather than being embedded in
# the format string, so a "%" or "\" in the name is printed literally.
$verbose_mode && printf "%s using temp dir: %s\n" "$script_name" "$tmp_dir"

# Files that receive the response headers and the response body.
tmp_header_file="$tmp_dir/${script_name%%.*}_tmp_header"
tmp_content_file="$tmp_dir/${script_name%%.*}_tmp_content"
if $verbose_mode; then
  echo "${script_name} using temp header file: ${tmp_header_file}"
  echo "${script_name} using temp content file: ${tmp_content_file}"
fi
#######################################################################
#
# GET the web resource
#
#######################################################################
# init curl command-line options
# The options are collected in an array so that every value reaches curl
# as exactly one argument, whatever characters it contains. Building a
# command string and running it through eval would let "&", ";", quotes
# or spaces in the URL (or in a cached header value) be interpreted by
# the shell.
curl_args=( --silent --show-error )
curl_args+=( --user-agent "${user_agent_string}" )
curl_args+=( --dump-header "${tmp_header_file}" )
curl_args+=( --output "${tmp_content_file}" )

# add conditional GET headers (if possible)
conditional_get=false
if [ -f "$cached_header_file" ] && [ -f "$cached_content_file" ]; then
  if ! $force_get_mode; then
    # Header names are matched case-insensitively (they may be stored in
    # lowercase) and only at the start of a line, so that a header which
    # merely mentions the name in its value is not picked up. If the
    # header occurs more than once, the first occurrence is used.
    header_value=$( grep -i '^Last-Modified:' "$cached_header_file" \
      | /usr/bin/tr -d '\r' \
      | sed -e 's/^[^:]*:[ ]*//' -e 's/[ ]*$//'
    )
    header_value=${header_value%%$'\n'*}
    if [ -n "$header_value" ]; then
      conditional_get=true
      curl_args+=( --header "If-Modified-Since: $header_value" )
    fi

    header_value=$( grep -i '^ETag:' "$cached_header_file" \
      | /usr/bin/tr -d '\r' \
      | sed -e 's/^[^:]*:[ ]*//' -e 's/[ ]*$//'
    )
    header_value=${header_value%%$'\n'*}
    if [ -n "$header_value" ]; then
      conditional_get=true
      curl_args+=( --header "If-None-Match: $header_value" )
    fi
  fi
fi

# invoke curl
if $verbose_mode; then
  # %q renders each argument shell-quoted, so the logged command shows
  # exactly where one argument ends and the next begins
  printf -v cmd '%q ' /usr/bin/curl "${curl_args[@]}" "$location"
  printf "%s issuing curl command: %s\n" "$script_name" "${cmd% }"
fi
/usr/bin/curl "${curl_args[@]}" "$location"
exit_status=$?
if [ $exit_status -ne 0 ]; then
  echo "ERROR: ${script_name} failed" >&2
  exit $exit_status
fi
# The header file must exist before the response can be processed.
if [ ! -f "$tmp_header_file" ]; then
  echo "ERROR: ${script_name} unable to find header file ${tmp_header_file}" >&2
  exit 1
fi

# In verbose mode, echo the response headers, each line prefixed with "> ".
if $verbose_mode; then
  echo "${script_name} received the following response:"
  # IFS= and -r keep each line exactly as received (no trimming of
  # whitespace, no backslash processing); the extra -n test prints a final
  # line that lacks a terminating newline.
  while IFS= read -r line || [ -n "$line" ]; do
    printf '> %s\n' "$line"
  done < "${tmp_header_file}"
fi
#######################################################################
#
# process the response
#
#######################################################################
# The status code is the second space-separated field of the first line
# of the header file. The carriage return is removed first so that a
# status line without a reason phrase still yields a clean code.
response_code=$( /usr/bin/head -n 1 "$tmp_header_file" \
  | /usr/bin/tr -d '\r' | cut -d' ' -f2
)
# %s rather than %d: an unexpected, non-numeric value is shown as it is
# instead of triggering a printf error.
$verbose_mode && printf "%s received response code: %s\n" "$script_name" "$response_code"

if [ "$response_code" = "200" ]; then
  # sanity check: the number of bytes received must equal the declared
  # Content-Length (when the header is present). The header name is
  # matched case-insensitively and only at the start of a line; if it
  # occurs more than once, the first occurrence is used.
  declared_content_length=$( grep -i '^Content-Length:' "$tmp_header_file" \
    | /usr/bin/tr -d '\r' \
    | sed -e 's/^[^:]*:[ ]*//' -e 's/[ ]*$//'
  )
  declared_content_length=${declared_content_length%%$'\n'*}
  actual_content_length=$( /bin/cat "$tmp_content_file" | /usr/bin/wc -c \
    | sed -e 's/^[ ]*//' -e 's/[ ]*$//'
  )
  if [ -n "$declared_content_length" ]; then
    if [ "$declared_content_length" != "$actual_content_length" ]; then
      echo "ERROR: ${script_name} failed content length check" >&2
      exit 1
    fi
  else
    echo "WARNING: Content-Length response header missing" >&2
  fi

  if $verbose_mode; then
    echo "${script_name} downloaded ${actual_content_length} bytes"
    if $conditional_get; then
      echo "${script_name} refreshing cache files"
    else
      echo "${script_name} initializing cache files"
    fi
  fi

  # update the cache
  # The content is copied before the headers. If the script is interrupted
  # between the two copies, the cache still holds the old validators, so
  # the next conditional GET is answered with 200 and the entry is rebuilt.
  # In the opposite order, new validators would vouch for old content and
  # a later 304 would serve it. If either copy fails, both files are
  # removed so that no partial entry remains.
  /bin/cp -f "$tmp_content_file" "$cached_content_file" >&2
  exit_status=$?
  if [ $exit_status -ne 0 ]; then
    /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
    echo "ERROR: ${script_name} failed copy to file ${cached_content_file}" >&2
    exit $exit_status
  fi
  /bin/cp -f "$tmp_header_file" "$cached_header_file" >&2
  exit_status=$?
  if [ $exit_status -ne 0 ]; then
    /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
    echo "ERROR: ${script_name} failed copy to file ${cached_header_file}" >&2
    exit $exit_status
  fi
elif [ "$response_code" = "304" ]; then
  # Not Modified: the cache is left untouched
  if $verbose_mode; then
    echo "${script_name} downloaded 0 bytes (cache is up-to-date)"
  fi
else
  echo "ERROR: ${script_name} failed with HTTP response code ${response_code}" >&2
  exit 1
fi
#######################################################################
#
# output the resource content
#
#######################################################################
# Without a complete cache entry (both files present) there is nothing to
# output: remove whatever is left and fail.
if [ ! -f "$cached_header_file" ] || [ ! -f "$cached_content_file" ]; then
  # ensure cache integrity
  /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
  echo "ERROR: ${script_name} failed to get metadata" >&2
  exit 1
fi

$verbose_mode && echo "${script_name} complete."
# With -H the headers of the response just received (the temporary header
# file) are written to stdout; otherwise the cached content is.
if ! $output_headers; then
  /bin/cat "${cached_content_file}"
else
  /bin/cat "${tmp_header_file}"
fi
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment