A web crawler done in BASH. Written for a friend at the Information Systems Engineering department.
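For reference, here is a hypothetical invocation (the script has no fixed filename in the gist, so crawler.sh and the URL below are stand-ins) that crawls for 30 seconds and prints a summary when it finishes; -t sets the total runtime in seconds and -s asks for the summary block at the end:

    ./crawler.sh -t 30 -s http://example.com/index.html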
#!/bin/bash
###
# A small script that recursively crawls a URL and fetches e-mail addresses from
# all the pages it can find. Written for a friend studying BASH for his Systems
# Engineering degree.
#
# Author: IceDragon <[email protected]>
# Contact: http://www.icerealm.org/contact
#
###--# CONFIGURATION #--###########################################################
# This is where we store URLs we've already been to, so we won't visit them again.
VISITED_URLS_FILE="/tmp/crawler_visited_urls"
# This is where we store the matches (e-mails or whatever they wanted), just so we
# could count them in the summary - not because we need them elsewhere :D
MATCHES_FILE="/tmp/crawler_matches"
# Regular expression used to extract URLs from within each HTML file.
EXPR_URL_MATCH="http://[^'\"]\+\.html"
# Default expression to look for in case -e wasn't specified by the user. This
# grabs all the e-mail addresses from each page we go through.
EXPRESSION="[A-Za-z0-9._%\\+-]\\+@[A-Za-z0-9.-]\\+\\.[A-Za-z]\\{2,6\\}"
# Number of seconds to run if no runtime was specified.
RUNTIME=10
# Request timeout for busted servers (so we don't wait over a minute for it to fail)
WGET_TIMEOUT=10
# Note: wget treats --tries=0 as unlimited retries, so use 1 for a single attempt.
WGET_RETRIES=1
###--# EXECUTION #--##############################################################
START_TIME=`date +%s`
# TODO: Something better than this. If this variable is set, we're in mid-recursion
# (or some idiot user stopped the script in mid-operation)
if [[ "$CRAWLER_RECURSION" == "" ]]; then
    # Remove stale data files from previous runs (if any)
    rm -f "$VISITED_URLS_FILE"
    rm -f "$MATCHES_FILE"
    export CRAWLER_RECURSION=1
    # Create the files so there won't be errors reading them before they're formed.
    touch "$VISITED_URLS_FILE" "$MATCHES_FILE"
else
    let CRAWLER_RECURSION++
fi
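# For reference: the top-level run exports CRAWLER_RECURSION=1, so a page reached
# two links away from the starting URL is processed with CRAWLER_RECURSION=3
# (each child inherits its parent's value and bumps its own copy).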
# Process parameters from the user
while getopts e:t:s OPTION; do
    case "$OPTION" in
        e)  # custom grep expression to look for in each page
            EXPRESSION=$OPTARG
            ;;
        t)  # total runtime, in seconds
            RUNTIME=$OPTARG
            ;;
        s)  # print a summary when done
            SUMMARY=1
            ;;
        *)
            echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL" >&2
            exit 1
            ;;
    esac
done
# If they specified a zero or negative runtime, then we won't have time to do
# anything, obviously. So we quit here as if the work were already done.
#
# The user is usually smarter than that, but the recursive calls won't be - that's
# how a recursion bottoms out and exits.
if [[ $RUNTIME -le 0 ]]; then
    exit 0
fi
# Figure out the URL from the user's arguments. If they didn't specify any, $URL
# will stay blank.
while [[ $# -gt 0 ]]; do
    if [[ "$1" == *://* ]]; then
        URL=$1
        break
    fi
    shift
done
# Show where we are at the moment (in case there are problems)
if [[ "$DEBUG" != "" ]]; then
    echo ">> $URL / T=$RUNTIME / CRAWLER_RECURSION=$CRAWLER_RECURSION"
fi
# Sanity check on the arguments
if [ "$URL" == "" ]; then
    echo "Syntax: $0 [-e expression] [-t runtime] [-s] URL"
    exit 1
fi
# Grab the data from the URL we were given.
TMP_PAGE=`mktemp`
wget "$URL" -t $WGET_RETRIES -T $WGET_TIMEOUT -qO "$TMP_PAGE"
WGET_STATUS=$?
echo "$URL" >> "$VISITED_URLS_FILE"
# If something terrible happened, let the caller know.
if [ $WGET_STATUS != 0 ]; then
    # Clean the temp file up, because no one else will do it
    rm "$TMP_PAGE"
    exit $WGET_STATUS
fi
# Extract results and display them along with the URL to the user.
# Each one is also stored in a file for counting purposes...
RESULTS=`grep -no "$EXPRESSION" "$TMP_PAGE"`
for result in $RESULTS; do
    echo "$URL $result"
    # Split the line number and the data itself into different variables.
    read LINE DATA <<< `echo "$result" | tr ":" " "`
    echo "$DATA" >> "$MATCHES_FILE"
done
# Extract the URLs from within the data we received.
URL_LIST=`grep -o "$EXPR_URL_MATCH" "$TMP_PAGE"`
# We don't need the $TMP_PAGE file anymore - we got everything we wanted from
# it. Clean up after ourselves.
rm "$TMP_PAGE"
# Go through each URL and activate this very script to handle the rest
# (a recursion, if you will)
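# For reference, the time budget handed to each child below: with -t 10, if 4
# seconds have elapsed by the time a link is followed, the child gets -t 6; once
# the remaining budget reaches zero, children exit immediately (see the RUNTIME
# check above).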
for u in $URL_LIST; do
    CURRENT_TIME=`date +%s`
    # Check if we still have time for it.
    if [[ $(($START_TIME + $RUNTIME)) -le $CURRENT_TIME ]]; then
        break
    fi
    # If we already visited this URL, don't do it again.
    if ! grep -qxF "$u" "$VISITED_URLS_FILE"; then
        "$0" -t $(($RUNTIME - ($CURRENT_TIME - $START_TIME))) -e "$EXPRESSION" "$u"
    fi
done
###--# Display Summary #--###
if [[ "$SUMMARY" != "" ]]; then
    URLS_VISITED=`wc -l $VISITED_URLS_FILE |cut -d " " -f 1`
    ELAPSED_TIME=$((`date +%s` - $START_TIME))
    # Guard against a crawl that finished within the same second (division by zero in bc).
    if [[ $ELAPSED_TIME -eq 0 ]]; then
        ELAPSED_TIME=1
    fi
    echo ""
    echo "================================================================="
    echo "Total time: $ELAPSED_TIME seconds"
    echo "Total number of pages scanned: $URLS_VISITED"
    echo "Total results: `wc -l $MATCHES_FILE |cut -d " " -f 1`"
    echo "Average scanning rate: `echo "scale=2; $URLS_VISITED/$ELAPSED_TIME" |bc` URLs per second"
    echo "================================================================="
fi
# TODO: Something better than this (see above)
if [[ "$CRAWLER_RECURSION" == "1" ]]; then
    # Clean shit up
    rm "$MATCHES_FILE"
    rm "$VISITED_URLS_FILE"
    unset CRAWLER_RECURSION
else
    let CRAWLER_RECURSION--
fi
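One more usage note: setting DEBUG in the environment makes every invocation (including the recursive ones, which inherit it) print the URL, remaining runtime, and recursion depth it is working with. A hypothetical example, with a placeholder URL:

    DEBUG=1 ./crawler.sh -s -t 20 http://example.com/index.html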