Created
October 7, 2016 10:44
-
-
Save drAlberT/739eff41ef654e4f41f80396b79a7838 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Shell script to make a locally browseable version of a given URL
#
# @author Emiliano Gabrielli <[email protected]>
# @license MIT

# Force the plain C locale for this shell AND for every child process
# (grep, sed, sort, wget). A bare assignment only reaches children when the
# variable already happens to be exported, so export explicitly.
export LC_ALL=C
export LANG=C
# ANSI SGR escape sequences used to decorate terminal output.

# -- text attributes
FONT_DEF=$'\e[0m'     # all attributes off
FONT_BOLD=$'\e[1m'
FONT_UNDER=$'\e[4m'
FONT_BLINK=$'\e[5m'

# -- foreground colours
COL_DEF=$'\e[39m'     # default foreground
COL_RED=$'\e[31m'
COL_GREEN=$'\e[32m'
COL_YELL=$'\e[33m'
COL_BLUE=$'\e[34m'
COL_MAGEN=$'\e[35m'

# -- back to default attributes and default colour in one go
RESET_DEF="${FONT_DEF}${COL_DEF}"
#######################################
# Exit handler: puts the terminal back to its default attributes and
# removes the temporary file, if there is one.
# Globals:
#   RESET_DEF (read) - escape sequence that resets font and colour
#   TEMPFILE  (read) - path of a temporary file; may be unset or empty
# Outputs:
#   RESET_DEF followed by a newline on stdout
#######################################
on_exit() {
  printf '%s\n' "${RESET_DEF}"
  # Quoted and guarded with `--` so a path containing blanks or starting
  # with a dash is removed as ONE file instead of being word-split.
  if [[ -f "${TEMPFILE:-}" ]]; then
    rm -f -- "${TEMPFILE}"
  fi
}
# Run the cleanup handler on every exit path.
trap on_exit EXIT

# The first INT/QUIT/TSTP delivered to this shell only prints a warning and
# then restores the default dispositions, so a second CTRL-C really aborts.
# Subprocesses are not affected and can still be interrupted right away.
trap 'echo -e "\nReally abort? CTRL-C again to abort.\n"; trap - INT QUIT TSTP' \
  INT QUIT TSTP
# Usage banner, printed (in red) when the mandatory URL argument is missing.
printf -v USAGE '\n%s\nUsage: %s <URL>\n%s\n' \
  "${COL_RED}" "${0##*/}" "${RESET_DEF}"

# $1 is mandatory: the expansion aborts the script with the banner otherwise.
URL=${1:?"${USAGE}"}

# Tunables, overridable from the environment; defaults are filled in when
# the variable is unset or empty.
: "${DOMAINS:=}"                        # extra domains to allow
: "${TARGET:=./MIRROR}"                 # where the mirror is stored
: "${SKIP:=awstats_misc_tracker.js}"    # targets matching this are skipped
: "${TAGS:=src href url}"               # attribute names to scan for links
TGT_DIR=${TARGET}
########################################################################################################## | |
#######################################
# Prints the host name of an http(s) URL, without scheme, credentials,
# port or path. Input that does not look like an http(s) URL is printed
# unchanged.
# Arguments: $1 - URL
# Outputs:   host name on stdout
#######################################
_url_host() {
  # The optional "user[:password]@" part must not span a '/', otherwise an
  # '@' inside the path (http://host/user@profile) is mistaken for the
  # credentials separator and the text after it is reported as the host.
  printf '%s\n' "$1" \
    | sed -E -e 's|https?://([^@/]*@)?([^:/]*).*|\2|'
}

HOST="$(_url_host "${URL}")"

# Options for the initial mirroring run. Kept as a whitespace-separated
# string, which is expanded unquoted where wget is invoked.
# DOMAINS is appended to the host only when non-empty, so no dangling comma
# (i.e. an empty domain entry) is handed to --domains.
WGET_OPTS="--continue
--no-cache
--no-dns-cache
--no-check-certificate
-e robots=off
--no-host-directories
--directory-prefix=${TGT_DIR}
--default-page=${DEFAULT_PAGE:="index.html"}
--adjust-extension
--convert-links
--page-requisites
--domains=${HOST}${DOMAINS:+,${DOMAINS}}
--recursive
--timestamping
--level inf
--no-remove-listing
"
#XXX remove existing stuff
# Ask for confirmation first: anything but a plain "y" quits with status 0
# and leaves the target directory untouched.
echo "Going to execute ${FONT_BOLD}${COL_RED}rm -rf '${TGT_DIR}/*'${FONT_DEF} ... continue? [y/N]${RESET_DEF}"
read -r RESP
[[ "${RESP}" == "y" ]] || exit 0
# Quoted so a directory name with blanks is not word-split; `:?` aborts
# instead of expanding to "/*" should TGT_DIR ever be empty, and `--` keeps
# a leading dash from being parsed as an option.
rm -rf -- "${TGT_DIR:?}"/*

#XXX first mirror the URL
#FIXME
# WGET_OPTS is a whitespace-separated option list and MUST be word-split;
# the URL is quoted so '?', '*' and blanks in it reach wget untouched.
# shellcheck disable=SC2086
wget ${WGET_OPTS} "${URL}"
# shellcheck disable=SC2086
echo wget ${WGET_OPTS} "${URL}"
sleep 2
#XXX then parse downloaded files and retrieve missing elements
#
# For every attribute name in TAGS the mirrored files are scanned for
# quoted values that are root-relative paths ("/css/site.css"). Each such
# resource is downloaded into the mirror and the reference is rewritten as
# a path relative to the referencing file, so that the copy can be browsed
# straight from disk.

#######################################
# Prints "scheme://authority" of an http(s) URL; anything else is printed
# unchanged.
# Arguments: $1 - URL
#######################################
_url_origin() {
  printf '%s\n' "$1" | sed -E -e 's|^(https?://[^/?#]*).*|\1|'
}

#######################################
# Prints $1 with every extended-regex metacharacter, and the '#' used as
# sed delimiter below, backslash-escaped.
#######################################
_ere_escape() {
  printf '%s\n' "$1" | sed -e 's/[][\\.*^$+?(){}|#]/\\&/g'
}

#######################################
# Prints $1 made safe for the replacement side of a sed s#..#..# command.
#######################################
_sub_escape() {
  printf '%s\n' "$1" | sed -e 's/[\\&#]/\\&/g'
}

#######################################
# Prints one "../" for every directory level between a file and the root
# of the mirror.
# Arguments: $1 - path of the file, starting with the mirror root
#            $2 - mirror root (trailing slashes are ignored)
#######################################
_rel_prefix() {
  local root=$2 rel slashes prefix="" i
  while [[ "${root}" == */ ]]; do root=${root%/}; done
  rel=${1#"${root}"}
  while [[ "${rel}" == /* ]]; do rel=${rel#/}; done
  # What is left of the path has exactly one '/' per directory level.
  slashes=$(printf '%s' "${rel}" | tr -cd '/')
  for (( i = 0; i < ${#slashes}; ++i )); do
    prefix+="../"
  done
  printf '%s' "${prefix}"
}

#######################################
# Replaces, in place, every quoted occurrence of a link target in a file.
# The target is matched literally and only as a complete quoted value, so
# longer paths that merely contain it are left alone and running the
# function twice changes nothing more.
# Arguments: $1 - file to edit
#            $2 - link target to replace (literal text)
#            $3 - replacement (literal text)
# Returns:   non-zero when sed fails
#######################################
_rewrite_link() {
  local file=$1 old new
  old=$(_ere_escape "$2")
  new=$(_sub_escape "$3")
  # `-i.bak` is accepted by both GNU and BSD sed (`-i ''` is BSD-only);
  # the backup is dropped once the edit has succeeded.
  sed -i.bak -E -e "s#([\"'])${old}([\"'])#\\1${new}\\2#g" "${file}" \
    && rm -f -- "${file}.bak"
}

_FS="@@#@@"                            # separates file and target in a match
_Q="[\"']"                             # one quote character
_NQ="[^\"']*"                          # a run of non-quote characters
_ORIGIN="$(_url_origin "${URL}")"      # root-relative links hang off this

# TAGS is a blank-separated list: word-splitting is intended here.
for _TAG in ${TAGS}; do
  # The matches are read line by line (on fd 3, so nothing in the loop can
  # eat them) instead of being word-split and glob-expanded by a `for`.
  while IFS= read -r -u 3 _M; do
    _FILE=${_M%%"${_FS}"*}
    _TGT=${_M##*"${_FS}"}

    case "${_TGT}" in
      //*) continue ;;  # protocol-relative link: lives on another host
      */) continue ;;   # "/" or a directory: cannot be saved as a file
      /*) ;;            # root-relative path: the case handled below
      *) continue ;;    # empty, external (http://...) or already relative
    esac

    #XXX skip explicitly marked URLs
    if printf '%s\n' "${_TGT}" | grep -qi -- "${SKIP}"; then
      continue
    fi

    #XXX actually retrieve missing files
    _PATH=${_TGT:1}
    printf 'Retrieving %s ... ' "${_PATH}"
    # wget -O does not create missing parent directories.
    mkdir -p -- "$(dirname -- "${TGT_DIR}/${_PATH}")"
    if ! wget --quiet --continue "${_ORIGIN}/${_PATH}" -O "${TGT_DIR}/${_PATH}"; then
      RET="${FONT_BOLD}${COL_RED}FAILED${RESET_DEF}"
      echo "${RET}."
      # skip files missing on remote side
      continue
    fi
    RET="${FONT_BOLD}${COL_GREEN}DONE${RESET_DEF}"
    echo "${RET}."

    #XXX make absolute URL relative
    PREFIX="$(_rel_prefix "${_FILE}" "${TGT_DIR}")"
    _rewrite_link "${_FILE}" "${_TGT}" "${PREFIX}${_PATH}"
  done 3< <(
    grep -ro -- "${_TAG}\\s*=\\s*${_Q}${_NQ}${_Q}" "${TGT_DIR}" \
      | sed -E -e "s|^(.*):${_TAG}[[:space:]]*=[[:space:]]*(${_Q}?)(${_NQ})(${_Q}?)|\\1${_FS}\\3|" \
      | sort -u
  )
done
#TODO | |
# awk -v f="___REPLACEMENT___" 'BEGIN {while (getline < f) txt=txt $0 "\n"} /___PLACEHOLDER___/ {sub("___PLACEHOLDER___", txt)} 1' ___FILE___
exit 0 | |
# vim: set ts=2 sw=2 ai sts=2 ft=sh : |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment