Last active
January 6, 2020 22:54
-
-
Save trscavo/5fb0ce26796da9321e84 to your computer and use it in GitHub Desktop.
Shell script to retrieve a web resource via HTTP Conditional GET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##################################################################### | |
# Retrieve a web resource via HTTP Conditional GET [RFC 7232] and | |
# output the resource on stdout. Cache the web resource and consult | |
# the cache on subsequent requests for the same resource. By default, | |
# return the cached resource (if it exists) if and only if the web | |
# server responds with 304 Not Modified. | |
# | |
# Usage: cget.sh [-vfcH] URL | |
# | |
# The default behavior of the script may be modified by using one of | |
# the following mutually exclusive options: | |
# | |
# -f Enables "Force GET Mode" | |
# -c Enables "Cache Only Mode" | |
# | |
# Force GET Mode disables HTTP Conditional GET by excluding the | |
# relevant request headers (If-Modified-Since and If-None-Match). | |
# Cache Only Mode bypasses the GET request altogether and goes | |
# directly to cache. If the resource resides in cache, it is output | |
# on stdout, otherwise an error is thrown. | |
# | |
# This script issues GET requests exclusively. However, the -H option | |
# causes the response headers to be output on stdout, as though a HEAD | |
# request were issued. The -H option may be used by itself or in | |
# conjunction with one of the mode options (-f or -c), in which case | |
# the response headers are obtained either from the server or from
# cache, as the case may be.
# | |
# Examples: | |
# cget.sh URL # Retrieve the resource using HTTP conditional GET | |
# cget.sh -H URL # Output the response headers (not the resource) | |
# cget.sh -f URL # Enable Force GET Mode | |
# cget.sh -c URL # Enable Cache Only Mode | |
# | |
##################################################################### | |
# Script identity; user_agent_string is handed to curl as the User-Agent.
script_name=${0##*/}  # equivalent to basename $0
script_version="0.1"
user_agent_string="cget ${script_version}"

#######################################################################
#
# process command-line options and arguments
#
#######################################################################

# Defaults: quiet, output the resource body, use HTTP conditional GET.
verbose_mode=false
output_headers=false
force_get_mode=false
cache_only_mode=false

# The leading ':' in the optstring selects silent error reporting, so an
# unknown option arrives as opt='?' with the offending letter in OPTARG.
while getopts ":vHfc" opt; do
  case "$opt" in
    v)
      verbose_mode=true
      ;;
    H)
      output_headers=true
      ;;
    f)
      # -f and -c are mutually exclusive: whichever is given last wins
      force_get_mode=true
      cache_only_mode=false
      ;;
    c)
      cache_only_mode=true
      force_get_mode=false
      ;;
    \?)
      echo "ERROR: $script_name: Unrecognized option: -$OPTARG" >&2
      exit 2
      ;;
  esac
done

# determine the location of the web resource: exactly one URL argument
# must remain once the options have been shifted away
shift $(( OPTIND - 1 ))
if [ $# -ne 1 ]; then
  echo "ERROR: $script_name: wrong number of arguments: $# (1 required)" >&2
  exit 2
fi
location="$1"
# The script name is passed as an argument, never as part of the format
# string, so a '%' or '\' in the file name cannot corrupt the message.
$verbose_mode && printf '%s using location URL: %s\n' "$script_name" "$location"
####################################################################### | |
# | |
# determine the cache files (which may or may not exist at this point) | |
# | |
####################################################################### | |
# Hash the location URL; the MD5 hex digest names the cache files.
#
# The digest is captured on its own so that $? is the status of openssl.
# In a pipeline "... | openssl ... | cut ..." the reported status is that
# of cut, which succeeds even when openssl fails; the resulting empty hash
# would make every URL share the same pair of cache files.
digest_output=$( printf '%s' "${location}" | /usr/bin/openssl dgst -md5 -hex )
exit_status=$?
# openssl prints "(stdin)= <digest>" (newer releases: "MD5(stdin)= <digest>");
# keep only the text after the last space
hash=${digest_output##* }
# refuse anything that is not a 32-digit hex string, whatever openssl said
if [ $exit_status -eq 0 ] && ! [[ "$hash" =~ ^[0-9a-fA-F]{32}$ ]]; then
  exit_status=1
fi
if [ $exit_status -ne 0 ]; then
  echo "ERROR: ${script_name} failed to hash the location URL" >&2
  exit $exit_status
fi
# Every cached response lives in one directory, created on first use.
web_cache_dir=/tmp/cache
if ! [ -d "$web_cache_dir" ]; then
  mkdir "$web_cache_dir" || {
    exit_status=$?
    echo "ERROR: ${script_name} failed to create dir: $web_cache_dir" >&2
    exit $exit_status
  }
fi

# A cache entry is a pair of files named after the hash of the URL.
cached_header_file="${web_cache_dir}/${hash}_headers"
cached_content_file="${web_cache_dir}/${hash}_content"
if $verbose_mode; then
  echo "${script_name} using cached header file: ${cached_header_file}"
  echo "${script_name} using cached content file: ${cached_content_file}"
fi

# An entry is usable only when both of its files are present.
if ! { [ -f "$cached_header_file" ] && [ -f "$cached_content_file" ]; }; then
  # ensure cache integrity: discard whichever half of the pair remains
  /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
  if $cache_only_mode; then
    # Cache Only Mode never touches the network, so a miss is fatal
    if $verbose_mode; then
      echo "${script_name} complete."
    fi
    echo "ERROR: ${script_name} failed to find cached resource: $location" >&2
    exit 1
  fi
elif $cache_only_mode; then
  # Cache Only Mode with a complete entry: serve it and stop here
  if $verbose_mode; then
    echo "${script_name} complete."
  fi
  cache_output_file="${cached_content_file}"
  if $output_headers; then
    cache_output_file="${cached_header_file}"
  fi
  /bin/cat "${cache_output_file}"
  exit 0
fi
####################################################################### | |
# | |
# initialization | |
# | |
####################################################################### | |
# create a temporary directory to receive curl's output for this run;
# the "-t" form is tried only when plain "mktemp -d" fails
tmp_dir=$( mktemp -d 2>/dev/null || mktemp -d -t "${script_name%.*}" )
if [ ! -d "$tmp_dir" ] ; then
  printf "ERROR: Unable to create temporary dir\n" >&2
  exit 1
fi
# Remove the temporary directory on every exit path (success or error);
# without this each invocation leaves a directory behind. The :? guard
# aborts the rm should the variable ever be empty.
trap '/bin/rm -rf -- "${tmp_dir:?}"' EXIT
# the script name is an argument, not part of the printf format string
$verbose_mode && printf '%s using temp dir: %s\n' "$script_name" "$tmp_dir"

# file names inside the temporary directory: response headers and body
tmp_header_file="$tmp_dir/${script_name%%.*}_tmp_header"
tmp_content_file="$tmp_dir/${script_name%%.*}_tmp_content"
if $verbose_mode; then
  echo "${script_name} using temp header file: ${tmp_header_file}"
  echo "${script_name} using temp content file: ${tmp_content_file}"
fi
####################################################################### | |
# | |
# GET the web resource | |
# | |
####################################################################### | |
# init curl command line
#
# The command is kept in an array and executed directly, so every
# argument reaches curl verbatim. Building one string and running it
# through eval let the shell re-interpret the URL and the cached header
# values: an '&' in a query string backgrounded curl, and '?', '*', ';',
# '$(...)' or quote characters were expanded or executed.
curl_cmd=(/usr/bin/curl --silent --show-error)
curl_cmd+=(--user-agent "${user_agent_string}")
curl_cmd+=(--dump-header "${tmp_header_file}")
curl_cmd+=(--output "${tmp_content_file}")

#######################################
# Print the value of a response header stored in a header file.
# Arguments: $1 - path of the header file
#            $2 - header name, e.g. ETag (matched case-insensitively
#                 at the start of a line)
# Outputs:   the value of the first matching header with CRs and the
#            surrounding spaces removed; nothing if there is no match
#######################################
cget_cached_header_value() {
  /usr/bin/tr -d "\r" < "$1" \
    | grep -i "^$2:" \
    | /usr/bin/head -n 1 \
    | sed -e 's/^[^:]*:[ ]*//' -e 's/[ ]*$//'
}

# add conditional GET headers (if possible): only when a complete cache
# entry exists and Force GET Mode is off
conditional_get=false
if [ -f "$cached_header_file" ] && [ -f "$cached_content_file" ]; then
  if ! $force_get_mode; then
    header_value=$( cget_cached_header_value "$cached_header_file" 'Last-Modified' )
    if [ -n "$header_value" ]; then
      conditional_get=true
      curl_cmd+=(--header "If-Modified-Since: $header_value")
    fi
    header_value=$( cget_cached_header_value "$cached_header_file" 'ETag' )
    if [ -n "$header_value" ]; then
      conditional_get=true
      curl_cmd+=(--header "If-None-Match: $header_value")
    fi
  fi
fi

# invoke curl; --url keeps a location that starts with '-' from being
# parsed as an option
curl_cmd+=(--url "$location")
if $verbose_mode; then
  # %q renders each argument shell-quoted so the logged command is exact
  printf -v cmd_display '%q ' "${curl_cmd[@]}"
  printf '%s issuing curl command: %s\n' "$script_name" "${cmd_display% }"
fi
"${curl_cmd[@]}"
exit_status=$?
if [ $exit_status -ne 0 ]; then
  echo "ERROR: ${script_name} failed" >&2
  exit $exit_status
fi
if [ ! -f "$tmp_header_file" ]; then
  echo "ERROR: ${script_name} unable to find header file ${tmp_header_file}" >&2
  exit 1
fi
if $verbose_mode; then
  echo "${script_name} received the following response:"
  # IFS= and -r keep each header line exactly as received
  while IFS= read -r line; do
    echo "> $line"
  done < "${tmp_header_file}"
fi
####################################################################### | |
# | |
# process the response | |
# | |
####################################################################### | |
# Parse the status line (the first line of the dumped headers), for
# example "HTTP/1.1 200 OK": the second whitespace-separated token is the
# response code. The trailing CR is stripped first so that a status line
# with no reason phrase still yields a clean code.
status_line=''
IFS= read -r status_line < "$tmp_header_file"
read -r _ response_code _ <<< "${status_line%$'\r'}"
$verbose_mode && printf '%s received response code: %s\n' "$script_name" "$response_code"

if [ "$response_code" = "200" ]; then
  # sanity check: the number of bytes received must equal the declared
  # Content-Length. The header name is matched case-insensitively at the
  # start of a line, so lower-case header names are found and a mention
  # of "Content-Length" inside another header's value is ignored.
  declared_content_length=$( /usr/bin/tr -d "\r" < "$tmp_header_file" \
    | grep -i '^Content-Length:' \
    | /usr/bin/head -n 1 \
    | sed -e 's/^[^:]*:[ ]*//' -e 's/[ ]*$//'
  )
  # wc may pad its count with spaces; strip them before comparing
  actual_content_length=$( /bin/cat "$tmp_content_file" | /usr/bin/wc -c \
    | sed -e 's/^[ ]*//' -e 's/[ ]*$//'
  )
  if [ -n "$declared_content_length" ]; then
    if [ "$declared_content_length" != "$actual_content_length" ]; then
      echo "ERROR: ${script_name} failed content length check" >&2
      exit 1
    fi
  else
    echo "WARNING: Content-Length response header missing" >&2
  fi

  if $verbose_mode; then
    echo "${script_name} downloaded ${actual_content_length} bytes"
    if $conditional_get; then
      echo "${script_name} refreshing cache files"
    else
      echo "${script_name} initializing cache files"
    fi
  fi

  # update the cache; maintain cache integrity at all times: if either
  # copy fails, both cache files are removed so that no half-written
  # entry is left behind
  /bin/cp -f "$tmp_header_file" "$cached_header_file" >&2
  exit_status=$?
  if [ $exit_status -ne 0 ]; then
    /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
    echo "ERROR: ${script_name} failed copy to file ${cached_header_file}" >&2
    exit $exit_status
  fi
  /bin/cp -f "$tmp_content_file" "$cached_content_file" >&2
  exit_status=$?
  if [ $exit_status -ne 0 ]; then
    /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
    echo "ERROR: ${script_name} failed copy to file ${cached_content_file}" >&2
    exit $exit_status
  fi
elif [ "$response_code" = "304" ]; then
  # 304 Not Modified: the cache files are left untouched
  if $verbose_mode; then
    echo "${script_name} downloaded 0 bytes (cache is up-to-date)"
  fi
else
  echo "ERROR: ${script_name} failed with HTTP response code ${response_code}" >&2
  exit 1
fi
####################################################################### | |
# | |
# output the resource content | |
# | |
####################################################################### | |
# Guard: both cache files must exist at this point; otherwise clear the
# partial entry and give up.
if ! [ -f "$cached_header_file" ] || ! [ -f "$cached_content_file" ]; then
  # ensure cache integrity
  /bin/rm -f "$cached_header_file" "$cached_content_file" >&2
  echo "ERROR: ${script_name} failed to get metadata" >&2
  exit 1
fi

if $verbose_mode; then
  echo "${script_name} complete."
fi

# With -H the headers held in the temporary header file are written;
# otherwise the cached resource content is written.
if $output_headers; then
  output_file="${tmp_header_file}"
else
  output_file="${cached_content_file}"
fi
/bin/cat "${output_file}"
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment