A web crawler done in BASH. Written for a friend at the Information Systems Engineering department.
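For reference, here is a hypothetical invocation (the script has no fixed filename in the gist, so crawler.sh and the URL below are stand-ins) that crawls for 30 seconds and prints a summary when it finishes; -t sets the total runtime in seconds and -s asks for the summary block at the end:

    ./crawler.sh -t 30 -s http://example.com/index.html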
#!/bin/bash
###
# A small script that recursively crawls a URL and fetches e-mail addresses from
# all the pages it can find. Written for a friend studying BASH for his Systems
# Engineering degree.
#
# Author: IceDragon <[email protected]>
# Contact: http://www.icerealm.org/contact
#
###--# CONFIGURATION #--###########################################################
# This is where we store URLs we've already been to, so we won't visit them again.
VISITED_URLS_FILE="/tmp/crawler_visited_urls"
# This is where we store the matches (e-mails or whatever they wanted), just so we
# could count them in the summary - not because we need them elsewhere :D
MATCHES_FILE="/tmp/crawler_matches"
# Regular expression used to extract URLs from within each HTML file.
EXPR_URL_MATCH="http://[^'\"]\+\.html"
# Default expression to look for in case -e wasn't specified by the user. This
# grabs all the e-mail addresses from each page we go through.
EXPRESSION="[A-Za-z0-9._%\\+-]\\+@[A-Za-z0-9.-]\\+\\.[A-Za-z]\\{2,6\\}"
# Number of seconds to run if no runtime was specified.
RUNTIME=10
# Request timeout for busted servers (so we don't wait over a minute for it to fail)
WGET_TIMEOUT=10
# Note: wget treats --tries=0 as unlimited retries, so use 1 for a single attempt.
WGET_RETRIES=1
###--# EXECUTION #--##############################################################
START_TIME=`date +%s`
# TODO: Something better than this. If this variable is set, we're in mid-recursion
# (or some idiot user stopped the script in mid-operation)
if [[ "$CRAWLER_RECURSION" == "" ]]; then
    # Remove stale data files from previous runs (if any)
    rm -f "$VISITED_URLS_FILE"
    rm -f "$MATCHES_FILE"
    export CRAWLER_RECURSION=1
    # Create the files so there won't be errors reading them before they're formed.
    touch "$VISITED_URLS_FILE" "$MATCHES_FILE"
else
    let CRAWLER_RECURSION++
fi
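# For reference: the top-level run exports CRAWLER_RECURSION=1, so a page reached
# two links away from the starting URL is processed with CRAWLER_RECURSION=3
# (each child inherits its parent's value and bumps its own copy).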
# Process parameters from the user
while getopts e:t:s OPTION; do
    case "$OPTION" in
        e)  # custom grep expression to look for in each page
            EXPRESSION=$OPTARG
            ;;
        t)  # total runtime, in seconds
            RUNTIME=$OPTARG
            ;;
        s)  # print a summary when done
            SUMMARY=1
            ;;
        *)
            echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL" >&2
            exit 1
            ;;
    esac
done
# If they specified a zero or negative runtime, then we won't have time to do
# anything, obviously. So we quit here as if the work were already done.
#
# The user is usually smarter than that, but the recursive calls won't be - that's
# how a recursion bottoms out and exits.
if [[ $RUNTIME -le 0 ]]; then
    exit 0
fi
# Figure out the URL from the user's arguments. If they didn't specify any, $URL
# will stay blank.
while [[ $# -gt 0 ]]; do
    if [[ "$1" == *://* ]]; then
        URL=$1
        break
    fi
    shift
done
# Show where we are at the moment (in case there are problems)
if [[ "$DEBUG" != "" ]]; then
    echo ">> $URL / T=$RUNTIME / CRAWLER_RECURSION=$CRAWLER_RECURSION"
fi
# Sanity check on the arguments
if [ "$URL" == "" ]; then
    echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL"
    exit 1
fi
# Grab the data from the URL we were given.
TMP_PAGE=`mktemp`
wget "$URL" -t $WGET_RETRIES -T $WGET_TIMEOUT -qO "$TMP_PAGE"
WGET_STATUS=$?
echo "$URL" >> "$VISITED_URLS_FILE"
# If something terrible happened, let the caller know.
if [ $WGET_STATUS != 0 ]; then
    # Clean the temp file up, because no one else will do it
    rm "$TMP_PAGE"
    exit $WGET_STATUS
fi
# Extract results and display them along with the URL to the user.
# Each one is also stored in a file for counting purposes...
RESULTS=`grep -no "$EXPRESSION" "$TMP_PAGE"`
for result in $RESULTS; do
    echo "$URL $result"
    # Split the line number and the data itself into different variables.
    read LINE DATA <<< `echo "$result" | tr ":" " "`
    echo "$DATA" >> "$MATCHES_FILE"
done
# Extract the URLs from within the data we received.
URL_LIST=`grep -o "$EXPR_URL_MATCH" "$TMP_PAGE"`
# We don't need the $TMP_PAGE file anymore - we got everything we wanted from
# it. Clean up after ourselves.
rm "$TMP_PAGE"
# Go through each URL and activate this very script to handle the rest
# (a recursion, if you will)
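# For reference, the time budget handed to each child below: with -t 10, if 4
# seconds have elapsed by the time a link is followed, the child gets -t 6; once
# the remaining budget reaches zero, children exit immediately (see the RUNTIME
# check above).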
for u in $URL_LIST; do
    CURRENT_TIME=`date +%s`
    # Check if we still have time for it.
    if [[ $(($START_TIME + $RUNTIME)) -le $CURRENT_TIME ]]; then
        break
    fi
    # If we already visited this URL, don't do it again.
    if ! grep -qxF "$u" "$VISITED_URLS_FILE"; then
        "$0" -t $(($RUNTIME - ($CURRENT_TIME - $START_TIME))) -e "$EXPRESSION" "$u"
    fi
done
###--# Display Summary #--###
if [[ "$SUMMARY" != "" ]]; then
    URLS_VISITED=`wc -l $VISITED_URLS_FILE |cut -d " " -f 1`
    ELAPSED_TIME=$((`date +%s` - $START_TIME))
    # Guard against a crawl that finished within the same second (division by zero in bc).
    if [[ $ELAPSED_TIME -eq 0 ]]; then
        ELAPSED_TIME=1
    fi
    echo ""
    echo "================================================================="
    echo "Total time: $ELAPSED_TIME seconds"
    echo "Total number of pages scanned: $URLS_VISITED"
    echo "Total results: `wc -l $MATCHES_FILE |cut -d " " -f 1`"
    echo "Average scanning rate: `echo "scale=2; $URLS_VISITED/$ELAPSED_TIME" |bc` URLs per second"
    echo "================================================================="
fi
# TODO: Something better than this (see above)
if [[ "$CRAWLER_RECURSION" == "1" ]]; then
    # Clean shit up
    rm "$MATCHES_FILE"
    rm "$VISITED_URLS_FILE"
    unset CRAWLER_RECURSION
else
    let CRAWLER_RECURSION--
fi
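One more usage note: setting DEBUG in the environment makes every invocation (including the recursive ones, which inherit it) print the URL, remaining runtime, and recursion depth it is working with. A hypothetical example, with a placeholder URL:

    DEBUG=1 ./crawler.sh -s -t 20 http://example.com/index.html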