qnkhuat · March 3, 2022 10:20 · qnkhuat · Aug 2, 2021
diff --git a/scan.sh b/scan.sh
 #!/bin/bash
 # Shell options.
 set -f          # noglob: keep patterns such as *.md literal instead of expanding them
 set -o errexit  # exit if any command yields an error

 # Constants: ANSI colour escape sequences (interpreted by `echo -e`) and the
 # upper bound used when throttling background jobs.
 CRed='\033[0;31m'
 CGreen='\033[0;32m'
 CYellow='\033[0;33m'
 CBlue='\033[0;34m'
 CWhite='\033[0;37m'
 MAX_CONCURRENT=20
 readonly CRed CGreen CYellow CBlue CWhite MAX_CONCURRENT

 # Defaults for the command-line arguments.
 DIR="./"      # directory handed to find
 INCLUDE="*"   # comma separated name patterns to include
 EXCLUDE=""    # comma separated name patterns to exclude
 DOMAIN=""     # domain inserted into the URL pattern
 CURLARGS=""   # extra arguments appended to every curl call
 VERBOSE=false
 QUICK=false

 help() {
  echo -e "usage: ./scan.sh .
  Example:

    - Check all links with for domain fig.io in mark downfiles in current directory
      $CGreen./scan.sh . -i \"*.md\" -d \"fig.io\" $CWhite
    - Scan all json files and markdown except package.json in /dev
      $CGreen./scan.sh dev/ -i \"*.json,*.md\" -e \"package*.json\" $CWhite
  -v, --verbose
    Verbose mode
  -f, --follow
    Follow redirects link. (Default is not follow)
  -e, --exclude
    Regex to exclude files. Seperated by comma
    I.e: package*.json,node_modules/*
  -i, --include
    Regex to search files. Seperated by comma
    I.e: *.md,*.txt
  -d, --domain
    Scan for a specific domain.
    I.e: fig.io
  -q, --quick
    Scan quickly. This might open up many curl background processes
  "
 }


 # Build arguments.
 # Options that take a value (-e, -i, -d) consume the following argument; a
 # bare argument is taken as the directory to scan (the last one wins).
 # Looping on $# (instead of stopping at the first empty string) keeps an
 # empty argument from silently discarding every option after it.
 while [[ $# -gt 0 ]]; do
   case "$1" in
     -h | --help)
       help
       exit 0
       ;;

     -f | --follow)
       CURLARGS+='-L '
       ;;

     -v | --verbose)
       VERBOSE=true
       ;;

     -q | --quick)
       QUICK=true
       ;;

     -e | --exclude | -i | --include | -d | --domain)
       # Without this check a trailing value-less option made the final
       # `shift` fail and `set -e` ended the script without any message.
       if [[ $# -lt 2 ]]; then
         echo "Missing value for $1" >&2
         exit 1
       fi
       case "$1" in
         -e | --exclude) EXCLUDE=$2 ;;
         -i | --include) INCLUDE=$2 ;;
         -d | --domain) DOMAIN="$2/" ;;
       esac
       shift # drop the option; the value is dropped by the shift below
       ;;

     -*)
       echo "Invalid argument $1" >&2
       exit 1
       ;;

     "")
       # ignore empty arguments, keep the current DIR
       ;;

     *)
       DIR=$1
       ;;
   esac
   shift
 done

 # Extended regex (grep -E) for the links to check: http(s)://, an optional
 # "www.", the requested domain (dots escaped so they only match a literal '.')
 # and the remaining URL characters up to and including a closing ')'.
 # The closing ')' is matched on purpose and written as \) so that it is a
 # well-defined literal; process_line removes it again before calling curl.
 # "(?:...)" is not POSIX ERE syntax (GNU grep -E does not treat it as a
 # group), so a plain group is used. NOTE(review): "www." is made optional
 # here; with the previous pattern it was mandatory wherever "(?:" worked.
 URL_PATTERN="(http|https)://(www\.)?${DOMAIN//./\\.}[a-zA-Z0-9./?=_%:-]*\)"
 #URL_PATTERN="(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)$DOMAIN?([^:\/?\n)(]+)*"

 # Build a find(1) expression fragment from a list of values.
 # $1 is a string that represents a list of values separated by ','. I.e: '*.md,*.json'
 # $2 is the option put in front of every value. I.e: -name
 # $3 is the prefix added between values when several are passed. I.e: -or
 # Prints the fragment on one line, e.g.: -name '*.md' -or -name '*.json'
 # (the values are single-quoted because the caller eval's the result).
 _expand_find_options() {
   local string_list=$1
   local option=$2
   local prefix_option=$3
   local IFS=$' \t\n'
   local -a patterns=()
   local -a parts=()
   local pattern

   # Split on commas/whitespace with `read` so the values are never
   # glob-expanded, whatever the caller's noglob setting is.
   # read -d '' returns non-zero at end of input; that is expected.
   read -r -d '' -a patterns <<< "${string_list//,/ }" || true

   for pattern in "${patterns[@]}"; do
     if (( ${#parts[@]} > 0 )) && [ -n "$prefix_option" ]; then
       parts+=("$prefix_option")
     fi
     parts+=("$option '$pattern'")
   done

   printf '%s\n' "${parts[*]}"
 }

 # List all files that match the arguments using find.
 # Reads the globals DIR, INCLUDE and EXCLUDE, assigns the result to FILE_LIST
 # and prints it as a single line of space separated paths.
 list_files() {
   local include_options=""
   local exclude_options=""
   local cmd

   if [ -n "$INCLUDE" ]; then
     include_options="$(_expand_find_options "$INCLUDE" -name -or)"
     # Group the alternatives: -or binds weaker than the implicit -and, so
     # without parentheses the excludes would only apply to the last include.
     if [ -n "$include_options" ]; then
       include_options="\\( $include_options \\)"
     fi
   fi

   if [ -n "$EXCLUDE" ]; then
     exclude_options="$(_expand_find_options "$EXCLUDE" '-not -name' -and)"
   fi

   # -type f: directories must not be listed, otherwise the recursive grep in
   # scan_file re-scans their whole content (including excluded files).
   # DIR is shell-quoted because the command line goes through eval.
   cmd="find $(printf '%q' "${DIR:-.}") -type f $include_options $exclude_options" # order matter

   FILE_LIST=$(eval "$cmd")
   # find prints one path per line; callers expect them separated by spaces
   printf '%s\n' "${FILE_LIST//$'\n'/ }"
 }


 # Request one URL with curl and report the HTTP status it returned.
 # $1 is the file the URL was found in
 # $2 is the line number inside that file
 # $3 is the URL; the first ')' in it is removed before the request
 # Reads the globals CURLARGS, VERBOSE and the colour constants.
 # 2xx is printed only in verbose mode, 3xx as a warning, anything else
 # (including curl failures, which yield 000) as an error.
 process_line() {
   local filename=$1
   local lineno=$2
   local url="${3/)/}"
   local statuscode

   # CURLARGS is left unquoted on purpose: it holds a space separated list of
   # extra options. A failing curl must not abort the script under `set -e`,
   # hence `|| true`; stdin is detached so curl cannot consume the caller's
   # loop input.
   statuscode=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 5 $CURLARGS "$url" < /dev/null) || true

   case "${statuscode:0:1}" in
     2)
       if [[ "$VERBOSE" == true ]]; then
         printf '%b %s - %s:%s\t%s\n' "$CGreen[OK  ]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
       fi
       ;;
     3)
       printf '%b %s - %s:%s\t%s\n' "$CYellow[WARN]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
       ;;
     *)
       printf '%b %s - %s:%s\t%s\n' "$CRed[ERR ]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
       ;;
   esac
 }

 # Extract every URL matching URL_PATTERN from a file and check each of them.
 # $1 is the path handed to grep.
 # Reads the globals URL_PATTERN and MAX_CONCURRENT; calls process_line with
 # "<file> <line number> <url>" for every match.
 scan_file() {
   local filepath=$1
   local output
   local line
   local -a fields

   # Grep's output has the format FILENAME:LINENO:https://..
   # The two sed expressions turn the first two ':' into spaces, giving
   # "FILENAME LINENO https://..".
   # No match makes grep exit non-zero, which is not an error here.
   output=$(grep --extended-regexp --only-matching --recursive --with-filename --line-number --binary-files=without-match -- "$URL_PATTERN" "$filepath" | sed -e 's/:/ /' -e 's/:/ /') || true

   while read -r line; do
     [ -n "$line" ] || continue

     # split into file / line number / url without glob expansion
     read -r -a fields <<< "$line"
     process_line "${fields[@]}"

     # wait if the number of running jobs exceeds the limit
     # (the original "[$(" lacked a space and ran a non-existent command)
     if (( $(jobs -r | wc -l) >= MAX_CONCURRENT )); then
       wait
     fi
   done <<< "$output"
   wait
 }

 # main
 # Collect the files, report how many were found, then scan them either one
 # after the other or (with --quick) as throttled background jobs.
 FILE_LIST=$(list_files)

 # Split the space separated list once. Counting array elements reports 0 for
 # an empty list, where piping through `wc -l` always reported at least 1.
 file_paths=()
 read -r -d '' -a file_paths <<< "$FILE_LIST" || true # non-zero at end of input is expected

 echo -e "Found $CGreen${#file_paths[@]}$CWhite files"
 if [[ "$VERBOSE" == true && ${#file_paths[@]} -gt 0 ]]; then
   printf '%s\n' "${file_paths[@]}"
 fi

 for f in "${file_paths[@]}"; do
   if [[ "$VERBOSE" == true ]]; then
     echo -e "Scanning file $CGreen$f$CWhite"
   fi

   if [[ "$QUICK" == true ]]; then
     scan_file "$f" &

     # wait if the number of running jobs exceeds the limit
     # (the original "[$(" lacked a space and ran a non-existent command)
     if (( $(jobs -r | wc -l) >= MAX_CONCURRENT )); then
       wait
     fi
   else
     scan_file "$f"
   fi
 done
 wait
	#!/bin/bash
	# Shell options.
	set -f          # noglob: keep patterns such as *.md literal instead of expanding them
	set -o errexit  # exit if any command yields an error

	# Constants: ANSI colour escape sequences (interpreted by `echo -e`) and the
	# upper bound used when throttling background jobs.
	CRed='\033[0;31m'
	CGreen='\033[0;32m'
	CYellow='\033[0;33m'
	CBlue='\033[0;34m'
	CWhite='\033[0;37m'
	MAX_CONCURRENT=20
	readonly CRed CGreen CYellow CBlue CWhite MAX_CONCURRENT

	# Defaults for the command-line arguments.
	DIR="./"      # directory handed to find
	INCLUDE="*"   # comma separated name patterns to include
	EXCLUDE=""    # comma separated name patterns to exclude
	DOMAIN=""     # domain inserted into the URL pattern
	CURLARGS=""   # extra arguments appended to every curl call
	VERBOSE=false
	QUICK=false

	# Print the usage text, with colourised example commands, to stdout.
	# Reads the CGreen / CWhite colour constants.
	# The name patterns in the examples need their '*' wildcard: the include and
	# exclude values are passed to `find -name`, so ".md" would only match a
	# file that is literally called ".md".
	help() {
	  echo -e "usage: ./scan.sh .
	Example:

	  - Check all links with for domain fig.io in mark downfiles in current directory
	    $CGreen./scan.sh . -i \"*.md\" -d \"fig.io\" $CWhite
	  - Scan all json files and markdown except package.json in /dev
	    $CGreen./scan.sh dev/ -i \"*.json,*.md\" -e \"package*.json\" $CWhite
	-v, --verbose
	  Verbose mode
	-f, --follow
	  Follow redirects link. (Default is not follow)
	-e, --exclude
	  Regex to exclude files. Seperated by comma
	  I.e: package*.json,node_modules/*
	-i, --include
	  Regex to search files. Seperated by comma
	  I.e: *.md,*.txt
	-d, --domain
	  Scan for a specific domain.
	  I.e: fig.io
	-q, --quick
	  Scan quickly. This might open up many curl background processes
	"
	}


	# Build arguments.
	# Options that take a value (-e, -i, -d) consume the following argument; a
	# bare argument is taken as the directory to scan (the last one wins).
	# The alternatives in the case patterns must be separated by a plain '|';
	# an escaped '\|' is a syntax error.
	# Looping on $# (instead of stopping at the first empty string) keeps an
	# empty argument from silently discarding every option after it.
	while [[ $# -gt 0 ]]; do
	  case "$1" in
	    -h | --help)
	      help
	      exit 0
	      ;;

	    -f | --follow)
	      CURLARGS+='-L '
	      ;;

	    -v | --verbose)
	      VERBOSE=true
	      ;;

	    -q | --quick)
	      QUICK=true
	      ;;

	    -e | --exclude | -i | --include | -d | --domain)
	      # Without this check a trailing value-less option made the final
	      # `shift` fail and `set -e` ended the script without any message.
	      if [[ $# -lt 2 ]]; then
	        echo "Missing value for $1" >&2
	        exit 1
	      fi
	      case "$1" in
	        -e | --exclude) EXCLUDE=$2 ;;
	        -i | --include) INCLUDE=$2 ;;
	        -d | --domain) DOMAIN="$2/" ;;
	      esac
	      shift # drop the option; the value is dropped by the shift below
	      ;;

	    -*)
	      echo "Invalid argument $1" >&2
	      exit 1
	      ;;

	    "")
	      # ignore empty arguments, keep the current DIR
	      ;;

	    *)
	      DIR=$1
	      ;;
	  esac
	  shift
	done

	# Extended regex (grep -E) for the links to check: http(s)://, an optional
	# "www.", the requested domain (dots escaped so they only match a literal '.')
	# and the remaining URL characters up to and including a closing ')'.
	# The alternation needs a plain '|' ("\|" is a literal pipe in ERE).
	# The closing ')' is matched on purpose and written as \) so that it is a
	# well-defined literal; process_line removes it again before calling curl.
	# "(?:...)" is not POSIX ERE syntax (GNU grep -E does not treat it as a
	# group), so a plain group is used. NOTE(review): "www." is made optional
	# here; with the previous pattern it was mandatory wherever "(?:" worked.
	URL_PATTERN="(http|https)://(www\.)?${DOMAIN//./\\.}[a-zA-Z0-9./?=_%:-]*\)"
	#URL_PATTERN="(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)$DOMAIN?([^:\/?\n)(]+)*"

	# Build a find(1) expression fragment from a list of values.
	# $1 is a string that represents a list of values separated by ','. I.e: '*.md,*.json'
	# $2 is the option put in front of every value. I.e: -name
	# $3 is the prefix added between values when several are passed. I.e: -or
	# Prints the fragment on one line, e.g.: -name '*.md' -or -name '*.json'
	# (the values are single-quoted because the caller eval's the result).
	_expand_find_options() {
	  local string_list=$1
	  local option=$2
	  local prefix_option=$3
	  local IFS=$' \t\n'
	  local -a patterns=()
	  local -a parts=()
	  local pattern

	  # Split on commas/whitespace with `read` so the values are never
	  # glob-expanded, whatever the caller's noglob setting is.
	  # read -d '' returns non-zero at end of input; that is expected.
	  read -r -d '' -a patterns <<< "${string_list//,/ }" || true

	  for pattern in "${patterns[@]}"; do
	    if (( ${#parts[@]} > 0 )) && [ -n "$prefix_option" ]; then
	      parts+=("$prefix_option")
	    fi
	    parts+=("$option '$pattern'")
	  done

	  printf '%s\n' "${parts[*]}"
	}

	# List all files that match the arguments using find.
	# Reads the globals DIR, INCLUDE and EXCLUDE, assigns the result to FILE_LIST
	# and prints it as a single line of space separated paths.
	list_files() {
	  local include_options=""
	  local exclude_options=""
	  local cmd

	  if [ -n "$INCLUDE" ]; then
	    include_options="$(_expand_find_options "$INCLUDE" -name -or)"
	    # Group the alternatives: -or binds weaker than the implicit -and, so
	    # without parentheses the excludes would only apply to the last include.
	    if [ -n "$include_options" ]; then
	      include_options="\\( $include_options \\)"
	    fi
	  fi

	  if [ -n "$EXCLUDE" ]; then
	    exclude_options="$(_expand_find_options "$EXCLUDE" '-not -name' -and)"
	  fi

	  # -type f: directories must not be listed, otherwise the recursive grep in
	  # scan_file re-scans their whole content (including excluded files).
	  # DIR is shell-quoted because the command line goes through eval.
	  cmd="find $(printf '%q' "${DIR:-.}") -type f $include_options $exclude_options" # order matter

	  FILE_LIST=$(eval "$cmd")
	  # find prints one path per line; callers expect them separated by spaces
	  printf '%s\n' "${FILE_LIST//$'\n'/ }"
	}


	# Request one URL with curl and report the HTTP status it returned.
	# $1 is the file the URL was found in
	# $2 is the line number inside that file
	# $3 is the URL; the first ')' in it is removed before the request
	# Reads the globals CURLARGS, VERBOSE and the colour constants.
	# 2xx is printed only in verbose mode, 3xx as a warning, anything else
	# (including curl failures, which yield 000) as an error.
	process_line() {
	  local filename=$1
	  local lineno=$2
	  local url="${3/)/}"
	  local statuscode

	  # CURLARGS is left unquoted on purpose: it holds a space separated list of
	  # extra options. A failing curl must not abort the script under `set -e`,
	  # hence `|| true`; stdin is detached so curl cannot consume the caller's
	  # loop input.
	  statuscode=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 5 $CURLARGS "$url" < /dev/null) || true

	  case "${statuscode:0:1}" in
	    2)
	      if [[ "$VERBOSE" == true ]]; then
	        printf '%b %s - %s:%s\t%s\n' "$CGreen[OK ]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
	      fi
	      ;;
	    3)
	      printf '%b %s - %s:%s\t%s\n' "$CYellow[WARN]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
	      ;;
	    *)
	      printf '%b %s - %s:%s\t%s\n' "$CRed[ERR ]$CWhite" "$statuscode" "$filename" "$lineno" "$url"
	      ;;
	  esac
	}

	# Extract every URL matching URL_PATTERN from a file and check each of them.
	# $1 is the path handed to grep.
	# Reads the globals URL_PATTERN and MAX_CONCURRENT; calls process_line with
	# "<file> <line number> <url>" for every match.
	scan_file() {
	  local filepath=$1
	  local output
	  local line
	  local -a fields

	  # Grep's output has the format FILENAME:LINENO:https://..
	  # The two sed expressions turn the first two ':' into spaces, giving
	  # "FILENAME LINENO https://..". The stages must be joined with a real
	  # pipe; an escaped "\|" is handed to grep as a file name.
	  # No match makes grep exit non-zero, which is not an error here.
	  output=$(grep --extended-regexp --only-matching --recursive --with-filename --line-number --binary-files=without-match -- "$URL_PATTERN" "$filepath" | sed -e 's/:/ /' -e 's/:/ /') || true

	  while read -r line; do
	    [ -n "$line" ] || continue

	    # split into file / line number / url without glob expansion
	    read -r -a fields <<< "$line"
	    process_line "${fields[@]}"

	    # wait if the number of running jobs exceeds the limit
	    # (the original "[$(" lacked a space and ran a non-existent command)
	    if (( $(jobs -r | wc -l) >= MAX_CONCURRENT )); then
	      wait
	    fi
	  done <<< "$output"
	  wait
	}

	# main
	# Collect the files, report how many were found, then scan them either one
	# after the other or (with --quick) as throttled background jobs.
	FILE_LIST=$(list_files)

	# Split the space separated list once. Counting array elements needs no
	# pipeline (the escaped "\|" here were passed to echo as literal arguments)
	# and reports 0 for an empty list.
	file_paths=()
	read -r -d '' -a file_paths <<< "$FILE_LIST" || true # non-zero at end of input is expected

	echo -e "Found $CGreen${#file_paths[@]}$CWhite files"
	if [[ "$VERBOSE" == true && ${#file_paths[@]} -gt 0 ]]; then
	  printf '%s\n' "${file_paths[@]}"
	fi

	for f in "${file_paths[@]}"; do
	  if [[ "$VERBOSE" == true ]]; then
	    echo -e "Scanning file $CGreen$f$CWhite"
	  fi

	  if [[ "$QUICK" == true ]]; then
	    scan_file "$f" &

	    # wait if the number of running jobs exceeds the limit
	    # (the original "[$(" lacked a space and ran a non-existent command)
	    if (( $(jobs -r | wc -l) >= MAX_CONCURRENT )); then
	      wait
	    fi
	  else
	    scan_file "$f"
	  fi
	done
	wait