Bash script to scan dead links in a repo
#!/bin/bash
set -o noglob # prevent bash from expanding glob patterns (e.g. *.md) passed as arguments
set -e # exit immediately if any command fails
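# Example invocations (see help() below for the full option list):
#   ./scan.sh . -i "*.md" -d "fig.io"
#   ./scan.sh dev/ -i "*.json,*.md" -e "package*.json" -q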
# constants
readonly CRed='\033[0;31m' # Red
readonly CGreen='\033[0;32m' # Green
readonly CYellow='\033[0;33m' # Yellow
readonly CBlue='\033[0;34m' # Blue
readonly CWhite='\033[0;37m' # White
readonly MAX_CONCURRENT=20
# args
VERBOSE=false
QUICK=false
EXCLUDE=""
DOMAIN=""
INCLUDE="*"
CURLARGS=""
DIR="./"
help() {
echo -e "usage: ./scan.sh [directory] [options]
Examples:
- Check all links for the domain fig.io in markdown files in the current directory
$CGreen./scan.sh . -i \"*.md\" -d \"fig.io\" $CWhite
- Scan all json and markdown files except package.json in dev/
$CGreen./scan.sh dev/ -i \"*.json,*.md\" -e \"package*.json\" $CWhite
-v, --verbose
Verbose mode
-f, --follow
Follow redirects (default: do not follow)
-e, --exclude
Glob patterns of files to exclude, separated by commas
E.g: package*.json,node_modules/*
-i, --include
Glob patterns of files to include, separated by commas
E.g: *.md,*.txt
-d, --domain
Only scan links for a specific domain
E.g: fig.io
-q, --quick
Scan files concurrently. This may open many curl background processes
"
}
# Build arguments
while [[ "$1" != "" ]];
do case $1 in
-h | --help )
help
exit 0
;;
-f | --follow )
CURLARGS+='-L '
;;
-v | --verbose )
VERBOSE=true
;;
-e | --exclude )
shift
EXCLUDE=$1
;;
-i | --include )
shift
INCLUDE=$1
;;
-d | --domain )
shift
DOMAIN="$1/"
;;
-q | --quick )
QUICK=true
;;
-*)
echo "Invalid argument $1"
exit 1
;;
* )
DIR=$1
;;
esac; shift; done
URL_PATTERN="(http|https)://(www\.)?$DOMAIN[a-zA-Z0-9./?=_%:-]*"
#URL_PATTERN="(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)$DOMAIN?([^:\/?\n)(]+)*"
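# e.g. with -d "fig.io" the pattern only matches URLs under http(s)://fig.io/ or http(s)://www.fig.io/;
# with no -d it matches any http(s) URL built from the character class above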
# $1 is a string representing a list of options separated by ','. E.g: '*.md,*.json'
# $2 is the find option to apply to each value. E.g: -name
# $3 is the operator to insert between values when more than one is passed. E.g: -or
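# E.g: _expand_find_options '*.md,*.json' -name -or  yields  -name '*.md' -or -name '*.json'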
_expand_find_options() {
string_list=$1
option=$2
prefix_option=$3
local result=""
first=true
for i in ${string_list//,/ }
do
if [ "$first" = "false" ];
then
result+="$prefix_option ";
else
first=false;
fi
result+="$option '$i' "
done
echo $result
}
# List all files that match arguments using find
list_files() {
local INCLUDE_OPTIONS=""
local EXCLUDE_OPTIONS=""
if [ ! -z "$INCLUDE" ]; then
INCLUDE_OPTIONS="$(_expand_find_options $INCLUDE -name -or)"
fi
if [ ! -z "$EXCLUDE" ]; then
EXCLUDE_OPTIONS="$(_expand_find_options $EXCLUDE '-not -name' -and)"
fi
cmd="find $DIR $INCLUDE_OPTIONS $EXCLUDE_OPTIONS" # order matters: includes before excludes
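# e.g. ./scan.sh dev/ -i "*.json,*.md" -e "package*.json" builds roughly:
#   find dev/ -name '*.json' -or -name '*.md' -not -name 'package*.json'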
FILE_LIST=$(eval $cmd)
echo $FILE_LIST
}
process_line() {
filename=$1
lineno=$2
url="${3/)/}" # drop a ')' that can sneak into the match from markdown link syntax
# '|| true' keeps a failed curl (timeout, DNS error) from aborting the script under set -e;
# on failure http_code is reported as 000 and the link is flagged as an error below
statuscode=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 5 $CURLARGS "$url" || true)
if [[ ${statuscode:0:1} == "2" ]]; then
if [[ "$VERBOSE" == true ]]; then
echo -e "$CGreen[OK ]$CWhite $statuscode - $filename:$lineno\t$url"
fi
elif [[ ${statuscode:0:1} == "3" ]]; then
echo -e "$CYellow[WARN]$CWhite $statuscode - $filename:$lineno\t$url"
else
echo -e "$CRed[ERR ]$CWhite $statuscode - $filename:$lineno\t$url"
fi
}
scan_file() {
filepath=$1
# Grep's output has the format FILENAME:LINENO:https://...
# the two seds turn "FILENAME:LINENO:https://..." into "FILENAME LINENO https://..." so it word-splits into process_line's arguments
local output=$(grep --extended-regexp --only-matching --recursive --with-filename --line-number --binary-files=without-match "$URL_PATTERN" "$filepath" | sed 's/:/ /' | sed 's/:/ /')
while read -r line;
do
if [ ! -z "$line" ]; then
process_line $line & # check each link in the background; word splitting of $line is intentional
# wait if the number of background jobs exceeds MAX_CONCURRENT
[ "$(jobs | wc -l)" -ge "$MAX_CONCURRENT" ] && wait
fi
done <<< "$output"
wait
}
# main
FILE_LIST=$(list_files)
echo -e "Found $CGreen$(echo $FILE_LIST | tr " " "\n" | wc -l | xargs)$CWhite files"
if [[ "$VERBOSE" == true ]]; then
echo -e "$FILE_LIST" | tr " " "\n"
fi
for f in $FILE_LIST;
do
if [[ "$VERBOSE" == true ]]; then
echo -e "Scanning file $CGreen$f$CWhite"
fi
if [[ "$QUICK" == true ]];then
scan_file $f &
# wait if the number of background jobs exceeds MAX_CONCURRENT
[ "$(jobs | wc -l)" -ge "$MAX_CONCURRENT" ] && wait
else
scan_file $f
fi
done
wait
qnkhuat commented Aug 2, 2021:

The output looks something like this: [screenshot omitted]