Spider Using xidel, xargs, and curl
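Usage (a minimal sketch inferred from the argument handling at the bottom of the script; the filename spider.sh is an assumption, not part of the gist):

    ./spider.sh              # no arguments: check and crawl every URL listed in spider.txt
    ./spider.sh example.com  # one argument: crawl that site, prepending http:// if missing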
#!/bin/bash
spiderFile="spider.txt"
spiderInfo="spiderInfo.txt"
errorFile="html_errors.txt"
ok="spiderList.txt"
prefix="http://"
curl=$(which curl)
spiderBin=$(which xidel)
xargsBin=$(which xargs)
if [[ -z $curl ]]; then
    echo "You Need To Install curl To Use This Script"
    exit 1
fi
if [[ -z $spiderBin ]]; then
    echo "You Need To Install xidel To Use This Script"
    exit 1
fi
if [[ -z $xargsBin ]]; then
    echo "You Need To Install xargs To Use This Script"
    exit 1
fi
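# Added check (an assumption, mirroring the ones above): spider() below
# also calls httrack, which the original script never verified
httrackBin=$(which httrack)
if [[ -z $httrackBin ]]; then
    echo "You Need To Install httrack To Use This Script"
    exit 1
fi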
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob
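# Example: with extglob enabled, @(f|ht)tp?(s)://* matches ftp://, ftps://,
# http://, and https:// URLs in the case statement inside checkURLS below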
function checkURLS
{
    echo "Removing Old Outdated Files..."
    rm -f "$ok"
    rm -f "$errorFile"
    echo "Checking URLS..."
    while read url; do
        # remove comments
        url=${url%%#*}
        # skip empty lines
        if [[ -z "$url" ]]; then
            continue
        fi
        if [[ "$url" != "http"* ]]; then
            url=$prefix$url
        fi
        # Handle just ftp, http and https.
        # We could do full URL pattern matching, but meh.
        case "$url" in
            @(f|ht)tp?(s)://*)
                # Get just the numeric HTTP response code
                # (-s: silent, -L: follow redirects, -w '%{http_code}': print only the status)
                http_code=$(curl -sL -w '%{http_code}' "$url" -o /dev/null)
                case "$http_code" in
                    200|226|2*)
                        # You'll get a 226 in ${http_code} from a valid FTP URL.
                        # If all you really care about is that the response is in the 200's,
                        # you could match against "2??" instead.
                        echo -e "$url" >> "$ok"
                        ;;
                    *)
                        # You might want different handling for redirects (301/302).
                        echo -e "$url | $http_code" >> "$errorFile"
                        ;;
                esac
                ;;
            *)
                # If we're here, we didn't get a URL we could read.
                # (Don't log $http_code here: it would be stale from the previous iteration.)
                echo "WARNING: invalid url: $url"
                echo -e "$url | INVALID URL" >> "$errorFile"
                ;;
        esac
    done < "$spiderFile"
echo "$ok Created Form Valid URLS In $spiderFlle" | |
echo -e "============================================\n" | |
cat $ok | |
echo -e "============================================\n" | |
} | |
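# checkURLS leaves two files behind:
#   spiderList.txt  - URLs that answered with a 2xx status
#   html_errors.txt - "url | status" lines for everything else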
function wSpider1
{
    # -a appends wget's log output to the global $wgetLog set in spider();
    # a bare >> would miss it, since wget logs to stderr by default
    wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose -a "$wgetLog"
}
function spider
{
    if [[ -f $spiderInfo ]]; then
        echo -e "Found Old SpiderData...\n"
        echo -e "Backing Up Old SpiderData File: $spiderInfo -> ${spiderInfo}.old"
        echo -e "==================================================================\n"
        echo -e "Adding Last SpiderInfo To Old SpiderInfo File...\n"
        echo -e "==================================================\n" >> "${spiderInfo}.old"
        # Append the file's contents, not just its name
        cat "$spiderInfo" >> "${spiderInfo}.old"
        echo -e "Done!\nRemoving SpiderInfo File So We Can Start Fresh!\n"
        rm -f "$spiderInfo"
    fi
    clear
    echo -e "=====================================\n"
    echo -e "Spidering WebSite...\n\nPlease Wait...\n"
    echo -e "=====================================\n"
    echo -e "Spidering WebSite: $1...\n"
    # httrack writes its link list to ${spiderInfo}_httrack; terminal output is discarded
    httrack --continue "$1" --spider --list "${spiderInfo}_httrack" > /dev/null
    wgetLog="${spiderInfo}_wget"
    # First wget pass logs straight to $wgetLog via -o
    wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose -o "$wgetLog"
    # Collect relative hrefs from the start page, then fetch each linked page's hrefs
    xSpider="$(xidel "$1" -e '//a/@href' | grep -v 'http' | sort -u | xargs -L1 -I {} xidel "$1"{} -e '//a/@href' | grep -v 'http' | sort -u)"
    echo -e "=================\n" >> "$spiderInfo"
    echo -e "===xSpiderInfo:===\n" >> "$spiderInfo"
    echo -e "$xSpider" >> "$spiderInfo"
    echo -e "\n=================================\n"
    echo -e " Starting wSpider1..."
    echo -e "=========wSpiderInfo:==============\n"
    # Second wget pass appends to the same log (see wSpider1 above)
    wSpider1 "$1"
    if [[ -f $wgetLog ]]; then
        # Break the wget log into one URL/token per line
        sed -e 's/\s\+/\n/g' "$wgetLog" >> "$spiderInfo"
    fi
    sed -e 's/\s\+/\n/g' "$spiderInfo" >> "new_${spiderInfo}"
    mv -f "new_${spiderInfo}" "$spiderInfo"
    echo -e "==========END OF CRAWL========\n" >> "$spiderInfo"
    cat "$spiderInfo"
    echo -e "Removing Some Files That Are No-Longer Needed...\n"
    rm -f "${spiderInfo}_wget"
    rm -f "${spiderInfo}_httrack"
    # Return (not exit) so callers can spider the next URL in the list
    return 0
}
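# spider() leaves every discovered link, one per line, in spiderInfo.txt,
# and any previous run's results end up in spiderInfo.txt.old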
if [ ! -f $spiderFile ] || [ ! -s $spiderFile ]; then
    echo "SpiderFile $spiderFile Is Missing Or Empty..."
    echo "Creating Default SpiderFile..."
    touch $spiderFile
    echo "#http://www.google.com" >> $spiderFile
    echo "#http://www.facebook.com" >> $spiderFile
    echo "https://plus.google.com/collection" >> $spiderFile
    echo "Done!"
fi
if [ $# -eq 0 ]; then
    clear
    checkURLS
    echo "Checking For Spider File: $ok"
    if [ ! -f $ok ] || [ ! -s $ok ]; then
        echo -e "$ok Not Found Or Empty\n"
        read -p "URL: " site
        if [[ $site != "http"* ]]; then
            site=$prefix$site
        fi
        spider $site
    else
        while read data; do
            # remove comments
            url=${data%%#*}
            # skip empty lines
            if [[ -z "$url" ]]; then
                continue
            fi
            if [[ $url != "http"* ]]; then
                url=$prefix$url
            fi
            spider $url
        done < "$ok"
        echo -e "Done!\n"
        echo -e "============\n"
        echo -e "SpiderData:\n"
        echo -e "============\n"
        cat $spiderInfo
    fi
fi
if [[ $# -eq 1 ]]; then
    if [[ $1 != "http"* ]]; then
        site=$prefix$1
        echo -e "Spidering WebSite: ${site}...\n"
        spider ${site}
    else
        echo -e "Spidering WebSite: $1...\n"
        spider $1
    fi
fi
echo "Thanks For Spidering..." | |
exit 1 |
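The spider.txt format the script expects (inferred from the parsing above: one URL per line, # starts a comment, and the http:// prefix is optional because the script prepends it):

    # lines beginning with # are ignored
    example.com
    https://example.org/docs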