Skip to content

Instantly share code, notes, and snippets.

@drAlberT
Created October 7, 2016 10:44
Show Gist options
  • Save drAlberT/739eff41ef654e4f41f80396b79a7838 to your computer and use it in GitHub Desktop.
Save drAlberT/739eff41ef654e4f41f80396b79a7838 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Shell script to make a locally browseable version of a given URL
#
# @author Emiliano Gabrielli <[email protected]>
# @license MIT
# Force the plain C locale for this script and for every tool it starts
# (grep, sed, sort, wget). The variables are exported explicitly: a bare
# assignment only reaches child processes when the name already happens to
# be present in the inherited environment.
export LC_ALL=C
export LANG=C

# ANSI SGR escape sequences used to decorate terminal output.
FONT_DEF=$'\033[0m'
FONT_BOLD=$'\033[1m'
FONT_UNDER=$'\033[4m'
FONT_BLINK=$'\033[5m'
COL_DEF=$'\033[39m'
COL_RED=$'\033[31m'
COL_GREEN=$'\033[32m'
COL_YELL=$'\033[33m'
COL_BLUE=$'\033[34m'
COL_MAGEN=$'\033[35m'
# Sequence that undoes both the font attributes and the foreground colour.
RESET_DEF="${FONT_DEF}${COL_DEF}"
#######################################
# Exit handler: resets the terminal font/colour and removes the temporary
# file, if there is one.
# Globals:
#   RESET_DEF (read) - escape sequence printed to reset the terminal
#   TEMPFILE  (read) - path of a temporary file to delete; may be unset.
#                      NOTE(review): nothing in the visible script assigns
#                      it, so the removal only triggers when it is provided
#                      from elsewhere.
# Outputs:
#   RESET_DEF followed by a newline on stdout
#######################################
function on_exit()
{
  printf '%s\n' "${RESET_DEF:-}"
  # Quoted and guarded with "--": a path containing blanks or a leading dash
  # must be removed as one file, never split into several rm arguments.
  if [ -f "${TEMPFILE:-}" ]; then
    rm -rf -- "${TEMPFILE}"
  fi
}
# Run the clean-up handler whenever the script terminates.
trap on_exit EXIT

# Handler for the first INT/QUIT/TSTP received by this shell: ask for
# confirmation and drop back to the default signal handling, so that the
# next CTRL-C really aborts.
_confirm_abort() {
  echo -e "\nReally abort? CTRL-C again to abort.\n"
  trap - INT QUIT TSTP
}

# The signals are trapped (not ignored) in this shell only, so subprocesses
# can still be interrupted.
trap _confirm_abort INT QUIT TSTP
# Usage banner, shown (in red) when the mandatory URL argument is missing.
printf -v USAGE '\n%s\nUsage: %s <URL>\n%s\n' "${COL_RED}" "${0##*/}" "${RESET_DEF}"

# $1 - URL to mirror (mandatory; the script aborts with USAGE otherwise).
URL=${1:?"$USAGE"}

# Optional settings taken from the environment, with their defaults:
#   DOMAINS - extra domains appended to the wget --domains list
#   TARGET  - directory the mirror is written to
#   SKIP    - grep pattern of link targets that are never fetched
#   TAGS    - blank-separated attribute names scanned for link targets
: "${DOMAINS:=}"
: "${TARGET:=./MIRROR}"
: "${SKIP:=awstats_misc_tracker.js}"
: "${TAGS:=src href url}"
TGT_DIR=${TARGET}
##########################################################################################################
#######################################
# Print the host name contained in a URL.
# The scheme (optional), a "user[:password]@" prefix, the port and the
# path are dropped. The userinfo part is not allowed to span a "/", so an
# "@" occurring later in the path is never mistaken for it.
# Arguments: $1 - URL
# Outputs:   host name on stdout
#######################################
_url_host() {
  printf '%s\n' "$1" \
    | sed -E -e 's|^(https?://)?([^@/]*@)?([^:/]*).*|\3|'
}

HOST="$(_url_host "${URL}")"

# Options for the initial recursive mirror run. This is deliberately one
# whitespace-separated string: it is expanded unquoted where wget is called.
# DOMAINS is appended to --domains only when non-empty, which avoids a
# dangling trailing comma. DEFAULT_PAGE may be preset in the environment.
WGET_OPTS="--continue
--no-cache
--no-dns-cache
--no-check-certificate
-e robots=off
--no-host-directories
--directory-prefix=${TGT_DIR}
--default-page=${DEFAULT_PAGE:="index.html"}
--adjust-extension
--convert-links
--page-requisites
--domains=${HOST}${DOMAINS:+,${DOMAINS}}
--recursive
--timestamping
--level inf
--no-remove-listing
"
#XXX remove existing stuff
# Destructive step: ask first, and proceed only on an explicit lowercase "y".
echo "Going to execute ${FONT_BOLD}${COL_RED}rm -rf '${TGT_DIR}/*'${FONT_DEF} ... continue? [y/N]${RESET_DEF}"
read -r RESP
if [[ "${RESP}" != "y" ]]; then
  exit 0
fi
# ${TGT_DIR:?} aborts the script instead of letting the glob degrade to "/*"
# should the variable ever be empty; the quotes keep a directory name with
# blanks in one piece while the trailing /* is still expanded by the shell.
rm -rf -- "${TGT_DIR:?}"/*

#XXX first mirror the URL
#FIXME
# WGET_OPTS is a whitespace-separated option string, so it is expanded
# unquoted on purpose; the URL is quoted because "?", "*" and "[" in it
# would otherwise be treated as glob characters.
# shellcheck disable=SC2086
wget ${WGET_OPTS} "${URL}" \
  || printf 'warning: wget exited with status %d, the mirror may be incomplete\n' "$?" >&2
# shellcheck disable=SC2086
echo wget ${WGET_OPTS} "${URL}"
sleep 2
#XXX then parse downloaded files and retrieve missing elements
#
# For every attribute name in TAGS, the mirrored files are scanned for
# quoted TAG="..." values. Values that are site-absolute paths ("/x/y") are
# fetched into the mirror and the referencing file is rewritten to use a
# relative path, so the copy can be browsed from the local file system.
_FS="@@#@@"     # separator between file name and target in a match record
_Q="[\"']"      # regex: one quote character of either kind
_NQ="[^\"']*"   # regex: a run of non-quote characters

# Print "scheme://authority" of URL $1 (everything before the first "/" when
# the URL carries no scheme). Site-absolute paths resolve against this root,
# not against whatever path the start URL contains.
_site_root() {
  local url=$1
  local re='^[A-Za-z][A-Za-z0-9+.-]*://[^/?#]+'
  if [[ "${url}" =~ ${re} ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  else
    printf '%s\n' "${url%%/*}"
  fi
}

# Print the chain of "../" leading from the directory of file $1 back to the
# mirror root $2. Trailing slashes of $2 and doubled slashes are ignored.
_rel_prefix() {
  local base=${2%"${2##*[!/]}"}
  local rel=${1#"${base}"}
  local slashes prefix='' i
  slashes=$(printf '%s' "${rel}" | tr -s '/' | tr -cd '/')
  # One slash separates the root from a top-level file, hence "depth - 1".
  for (( i = 1; i < ${#slashes}; i++ )); do
    prefix="${prefix}../"
  done
  printf '%s' "${prefix}"
}

# Escape $1 for literal use inside a sed regex delimited by "#".
_sed_escape_pattern() {
  printf '%s' "$1" | sed -e 's/[][\.^$*+?(){}|#]/\\&/g'
}

# Escape $1 for literal use as a sed replacement delimited by "#".
_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\&#]/\\&/g'
}

# In file $2, change every quoted attribute $1 whose value is exactly $3 to
# the value $4. Only whole attribute values are touched, so the same path
# appearing inside an external URL stays intact. "-i.bak" plus removal of
# the backup is the in-place form accepted by both GNU and BSD sed.
_relativize() {
  local tag=$1 file=$2 from to
  from=$(_sed_escape_pattern "$3") || return
  to=$(_sed_escape_replacement "$4") || return
  sed -i.bak -E \
    -e "s#(${tag}[[:space:]]*=[[:space:]]*${_Q})${from}(${_Q})#\\1${to}\\2#g" \
    -- "${file}" || return
  rm -f -- "${file}.bak"
}

_ROOT=$(_site_root "${URL}")

# TAGS is a blank-separated list: word splitting is intended here.
for _TAG in ${TAGS}
do
  # Records are read line by line on fd 3, so blanks and glob characters in
  # file names or targets survive and stdin stays free for the commands in
  # the loop body. "sort" emits nothing before grep has finished, hence the
  # files modified below are never scanned while they change.
  while IFS= read -r _M <&3
  do
    _FILE=${_M%%"${_FS}"*}
    _TGT=${_M##*"${_FS}"}

    case "${_TGT}" in
      //*) continue ;;  # protocol-relative, i.e. an external host
      */)  continue ;;  # directory-style link, cannot be stored as a file
      /*)  ;;           # site-absolute path: handled below
      *)   continue ;;  # empty, relative or scheme-qualified (http://...)
    esac

    #XXX skip explicitly marked URLs
    if [[ -n "${SKIP}" ]] && grep -qi -e "${SKIP}" <<<"${_TGT}"; then
      continue
    fi

    #XXX actually retrieve missing files
    _REL=${_TGT:1}
    _DEST="${TGT_DIR}/${_REL}"
    _EXISTED=0
    if [[ -e "${_DEST}" ]]; then
      _EXISTED=1
    fi

    printf '%s' "Retrieving ${_REL} ... "
    if mkdir -p -- "${_DEST%/*}" \
      && wget --quiet --continue -O "${_DEST}" "${_ROOT}/${_REL}"
    then
      echo "${FONT_BOLD}${COL_GREEN}DONE${RESET_DEF}."
    else
      echo "${FONT_BOLD}${COL_RED}FAILED${RESET_DEF}."
      # wget -O creates the output file even when the download fails; do
      # not leave such a stub behind, but keep files that were there before.
      if (( ! _EXISTED )); then
        rm -f -- "${_DEST}"
      fi
      # skip files missing on remote side
      continue
    fi

    #XXX make absolute URL relative
    PREFIX=$(_rel_prefix "${_FILE}" "${TGT_DIR}")
    _relativize "${_TAG}" "${_FILE}" "${_TGT}" "${PREFIX}${_REL}" \
      || printf 'warning: could not rewrite %s in %s\n' "${_TGT}" "${_FILE}" >&2
  done 3< <(
    grep -rIo -e "${_TAG}[[:space:]]*=[[:space:]]*${_Q}${_NQ}${_Q}" -- "${TGT_DIR}" \
      | sed -E -e "s|^(.*):${_TAG}[[:space:]]*=[[:space:]]*${_Q}(${_NQ})${_Q}|\\1${_FS}\\2|" \
      | sort -u
  )
done
#TODO: substitute a placeholder inside a file with the contents of another
# file. Draft one-liner, kept verbatim and not used by the script yet:
# awk -v f="___REPLACEMENT___" 'BEGIN {while (getline < f) txt=tt $0 "\n"} /___PLACEHOLDER___/ {sub("___PLACEHOLDER___", txt)} 1' ___FILE___
# NOTE(review): "txt=tt $0" looks like a typo for "txt=txt $0" (as written
# only the last line read would be kept) - confirm before enabling.
# Reaching this point always reports success; retrievals that failed above
# were skipped and do not affect the exit status.
exit 0
# vim: set ts=2 sw=2 ai sts=2 ft=sh :
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment