@icedraco
Created November 27, 2014 11:29
A web crawler done in BASH. Written for a friend at the Information Systems Engineering department.
#!/bin/bash
###
# A small script that recursively crawls a URL and fetches e-mail addresses from
# all the pages it can find. Written for a friend studying BASH for his Systems
# Engineering degree.
#
# Author: IceDragon <[email protected]>
# Contact: http://www.icerealm.org/contact
#
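# Usage sketch (the file name "crawler.sh" and the example URLs below are
# placeholders for illustration, not something that ships with this gist):
#
#   ./crawler.sh -t 30 -s http://example.com/index.html
#       Crawl outward from index.html for up to 30 seconds, printing every
#       e-mail address found, then show a summary.
#
#   ./crawler.sh -e "tel:[0-9-]\+" -t 30 http://example.com/index.html
#       Same crawl, but collect matches of a custom expression instead of the
#       default e-mail pattern.
#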
###--# CONFIGURATION #--###########################################################
# This is where we store URLs we've already been to, so we won't visit them again.
VISITED_URLS_FILE="/tmp/crawler_visited_urls"
# This is where we store the matches (e-mails or whatever they wanted), just so we
# could count it in the summary - not because we need it elsewhere :D
MATCHES_FILE="/tmp/crawler_matches"
# Regular expression used to extract URLs from within each HTML file.
EXPR_URL_MATCH="http://[^'\"]\+\.html"
# Default expression to look for in case -e wasn't specified by the user. This
# grabs all the e-mail addresses from each page we go through.
EXPRESSION="[A-Za-z0-9._%\\+-]\\+@[A-Za-z0-9.-]\\+\\.[A-Za-z]\\{2,6\\}"
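# (Illustration only: once the shell strips one level of backslashes, the pattern
# above is a grep basic regex that matches addresses such as
# "first.last+tag@example.co.uk" - a made-up address, not taken from anywhere.)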
# Number of seconds to run if no runtime was specified.
RUNTIME=10
# Request timeout for busted servers (so we don't wait over a minute for it to fail)
WGET_TIMEOUT=10
# wget treats 0 retries as "retry forever", so use 1 to make a single attempt.
WGET_RETRIES=1
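# With the defaults above, each fetch below expands to roughly this (illustration
# only - the URL and temp file name are placeholders):
#
#   wget http://example.com/index.html -t 1 -T 10 -qO /tmp/tmp.XXXXXXXXXX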
###--# EXECUTION #--##############################################################
START_TIME=`date +%s`
# TODO: Something better than this. If this variable is set, we're in mid-recursion
# (or some idiot user stopped the script in mid-operation)
if [[ "$CRAWLER_RECURSION" == "" ]]; then
    # Remove stale data files from previous runs (if any)
    rm -f $VISITED_URLS_FILE
    rm -f $MATCHES_FILE
    export CRAWLER_RECURSION=1

    # Create files so there won't be errors reading them before they're formed.
    touch $VISITED_URLS_FILE $MATCHES_FILE
else
    let CRAWLER_RECURSION++
fi
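# (How the flag works: CRAWLER_RECURSION is exported, so it travels down to the
# child crawlers through the environment. Each child bumps its own copy, which
# means only the very first invocation ever sees it unset and does the
# setup/cleanup around the data files.)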
# Process parameters from user
while getopts e:t:s OPTION; do
    case "$OPTION" in
        e)
            EXPRESSION=$OPTARG
            ;;
        t)
            RUNTIME=$OPTARG
            ;;
        s)
            SUMMARY=1
            ;;
        *)
            echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL"
            exit 1
            ;;
    esac
done
# If they specified a zero or negative runtime, we won't have time to do
# anything, obviously, so we quit here as if the work were already done.
#
# The user is usually smarter than that, but the recursive calls aren't - running
# out of time budget is exactly how a recursion branch exits.
if [[ $RUNTIME -le 0 ]]; then
    exit 0
fi
# Figure out the URL from the user. If they didn't specify any, $URL will be blank.
shift $((OPTIND - 1))
for arg in "$@"; do
    if [[ "$arg" == *://* ]]; then
        URL=$arg
        break
    fi
done
# Show us where we are at the moment (in case there are problems)
if [[ "$DEBUG" != "" ]]; then
    echo ">> $URL / T=$RUNTIME / CRAWLER_RECURSION=$CRAWLER_RECURSION"
fi
# Sanity check on the arguments
if [ "$URL" == "" ]; then
    echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL"
    exit 1
fi
# Grab the data from the URL we were given, keeping wget's exit status around
# before anything else can overwrite $?.
TMP_PAGE=`mktemp`
wget $URL -t $WGET_RETRIES -T $WGET_TIMEOUT -qO $TMP_PAGE
WGET_STATUS=$?
echo $URL >> $VISITED_URLS_FILE

# If something terrible happened, let the caller know.
if [ $WGET_STATUS != 0 ]; then
    # Clean the temp file up, because no one else will do it
    rm $TMP_PAGE
    exit $WGET_STATUS
fi
# Extract results and display them along with the URL to the user.
# Storing each one into a file for counting purposes, as well...
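# (Each entry that grep -no produces below looks roughly like
# "12:someone@example.com" - an illustrative value - i.e. the line number, a
# colon, then the matched text, which is exactly what the read in the loop
# splits apart.)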
RESULTS=`grep -no "$EXPRESSION" $TMP_PAGE`
for result in $RESULTS; do
    echo "$URL $result"
    # Split the line number and the data itself into different variables.
    IFS=: read LINE DATA <<< "$result"
    echo $DATA >> $MATCHES_FILE
done
# Extract the URLs from within the data we received.
URL_LIST=`grep -o "$EXPR_URL_MATCH" $TMP_PAGE`
# We don't need the $TMP_PAGE file anymore - we got everything we wanted from
# it. Clean up after ourselves.
rm $TMP_PAGE
# Go through each URL and activate this very script to handle the rest
# (a recursion, if you will)
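# (Illustration of the time budget: if the top-level call was "-t 10" and 4
# seconds have already been spent, the child below is launched with "-t 6", its
# own children with whatever remains after that, and so on until a child gets a
# non-positive budget and bails out at the runtime check near the top.)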
for u in $URL_LIST; do
    CURRENT_TIME=`date +%s`

    # Check if we have time for it.
    if [[ $(($START_TIME + $RUNTIME)) -le $CURRENT_TIME ]]; then
        break
    fi

    # If we already visited this URL, don't do it again.
    if ! grep -qF "$u" $VISITED_URLS_FILE; then
        $0 -t $(($RUNTIME - ($CURRENT_TIME - $START_TIME))) -e "$EXPRESSION" $u
    fi
done
###--# Display Summary #--###
if [[ "$SUMMARY" != "" ]]; then
    URLS_VISITED=`wc -l $VISITED_URLS_FILE |cut -d " " -f 1`
    ELAPSED_TIME=$((`date +%s` - $START_TIME))

    # Avoid a divide-by-zero in bc if the whole crawl took under a second.
    if [[ $ELAPSED_TIME -eq 0 ]]; then
        ELAPSED_TIME=1
    fi

    echo ""
    echo "================================================================="
    echo "Total time: $ELAPSED_TIME seconds"
    echo "Total Number of pages scanned: $URLS_VISITED"
    echo "Total results: `wc -l $MATCHES_FILE |cut -d " " -f 1`"
    echo "Average scanning rate: `echo "scale=2; $URLS_VISITED/$ELAPSED_TIME" |bc` URLs per second"
    echo "================================================================="
fi
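# Illustrative summary output (the numbers here are made up):
#
#   =================================================================
#   Total time: 10 seconds
#   Total Number of pages scanned: 42
#   Total results: 17
#   Average scanning rate: 4.20 URLs per second
#   =================================================================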
# TODO: Something better than this (see above)
if [[ "$CRAWLER_RECURSION" == "1" ]]; then
    # Clean shit up
    rm $MATCHES_FILE
    rm $VISITED_URLS_FILE
    unset CRAWLER_RECURSION
fi

# Step back out of this recursion level (only our own copy of the counter is
# affected). Exit explicitly so the script doesn't inherit a non-zero status
# from let when the counter hits zero.
let CRAWLER_RECURSION--
exit 0