Skip to content

Instantly share code, notes, and snippets.

@HexagonWin
Last active August 15, 2023 15:53
Show Gist options
  • Save HexagonWin/64dc365a25a523eb154560b82e6fa524 to your computer and use it in GitHub Desktop.
Scrape egloos URLs from search engines
# --- Bing scraper configuration ---
QUERYNUM=1 # 1-based line number of the query currently read from $QUERYFILE
# Completely free to modify or whatever. Made by hexagonwin <[email protected]>
# This thing is FREE SOFTWARE or whatever you choose to call it
FIRST=1 # result offset sent to the engine ("first=" URL parameter for Bing)
# Bing starts at FIRST=1, Google starts at FIRST=0
PAGE=1 # page counter; used in the names of the saved HTML dumps
UA="Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)" # curl user agent
DELAY="2" # seconds to sleep between result-page fetches
OUTPUT=./out # working directory: logs, cookies, HTML dumps, result file
VER=1.2-and # version string ("-and" presumably = Android build; uses svc/settings)
LOGFILE=$OUTPUT/bing_log # everything eko() prints is also appended here
COOKIE=$OUTPUT/bing_cookie.txt # curl cookie jar; wiped on IP ban
OUTFILE=urls # collected domains, one per line, inside $OUTPUT
NewlyAdded=999 # URLs added by the last fetch; 999 = sentinel "not fetched yet"
#QUERY="site:*.egloos.com"
QUERYFILE=query # input file: one search query per line
# eko MESSAGE
# Print MESSAGE to stdout (interpreting backslash escapes such as \n, no
# trailing newline added) and append the exact same text to $LOGFILE.
# printf '%b' is the portable equivalent of the original `echo -ne`.
eko(){
printf '%b' "$1"
printf '%b' "$1" >> "$LOGFILE"
}
# Finish_Cleanup
# Post-process $OUTPUT/$OUTFILE in place: strip any " › ..." breadcrumb
# suffix left over from search-result snippets, sort the list, drop
# duplicates, and report the final item count. Reads/writes only
# $OUTPUT/$OUTFILE and logs progress via eko().
Finish_Cleanup(){
eko "Cleaning up.."
sed -i 's/ › .*//' "$OUTPUT/$OUTFILE"
eko "Sorting.."
sort "$OUTPUT/$OUTFILE" -o "$OUTPUT/$OUTFILE"
eko "Unduplicating.."
# Use an unpredictable mktemp name and an atomic mv instead of the old
# fixed "-tmp" file plus cat/rm shuffle.
UNIQTMP=$(mktemp)
uniq "$OUTPUT/$OUTFILE" > "$UNIQTMP"
mv -- "$UNIQTMP" "$OUTPUT/$OUTFILE"
eko "Finished!\n\n"
eko "We saved $(wc -l < "$OUTPUT/$OUTFILE") items\n"
}
# Abort early when there is nothing to search for.
if ! [ -f "$QUERYFILE" ]; then
eko "Query file $QUERYFILE nonexistent. Quitting..\n"
exit 1
fi
# Create the output tree and make sure the result and log files exist.
mkdir -p "$OUTPUT" "$OUTPUT/BINGHTML"
touch "$OUTPUT/$OUTFILE"
touch "$LOGFILE"
# Always start a run with a fresh cookie jar.
rm -f "$COOKIE"
eko "Init BDomainListRetrieve Agent $VER\n"
eko " -Log $LOGFILE, cookie $COOKIE, output to $OUTFILE on $OUTPUT, $(wc -l < "$QUERYFILE") queries\n"
#while read -r QUERY_ORIG; do
# Main Bing loop: walk $QUERYFILE line by line (indexed by QUERYNUM so a
# run can be resumed mid-file), paging through results for each query
# until a page yields no new URLs.
while true; do
# Terminate once QUERYNUM runs past the last line of the query file.
# Without this guard, sed returns empty forever past EOF and the loop
# never exits, making the Finish_Cleanup at the bottom unreachable.
if [ "$QUERYNUM" -gt "$(wc -l < "$QUERYFILE")" ]; then
break
fi
QUERY_ORIG=$(sed -n "${QUERYNUM}p" < $QUERYFILE)
# Only run when $QUERY is not empty
if [ -z "$QUERY_ORIG" ]; then
eko "Skipping #$QUERYNUM; Query is empty\n\n"
else
# Restrict every query to egloos.com; "-pds" excludes pds hits.
QUERY="$QUERY_ORIG site:egloos.com -pds"
eko "[$QUERY] Query #$QUERYNUM Starting\n"
# Keep paging while the previous fetch contributed at least one URL.
while ! [ $NewlyAdded -eq 0 ]; do
# We count output lines, we get newly added ones via this
SavedLines=$(wc -l < $OUTPUT/$OUTFILE)
# We save saved html -> ./WEB/HTML/From*.htm
eko "[$QUERY] PAGE $PAGE (from $FIRST)"
# jq '@uri' URL-encodes the query; the raw page HTML is kept under
# BINGHTML/ for later inspection.
STATUS=$(curl -s -w "%{http_code}" -b $COOKIE -c $COOKIE "https://www.bing.com/search?q=$(echo $QUERY | jq '@uri' -jRr)&first=$FIRST" -A "$UA" -o ${OUTPUT}/BINGHTML/Q${QUERYNUM}_P${PAGE}.htm)
if [ $STATUS -eq 302 ]; then
# A 302 from Bing is treated as a rate-limit / IP ban.
eko "..302!\n -> We got an IP ban."
rm $COOKIE
eko "\n -> Cleared cookie.."
eko "\n -> Resetting IP.."
CURIP=$(curl -s icanhazip.com)
eko "$CURIP"
# Toggle Android mobile data (svc) so the carrier assigns a new IP.
svc data disable
eko "..off."
sleep 2
svc data enable
eko ".on.."
# Busy-wait until the rmnet0 mobile interface has an address again.
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
eko "online"
sleep 2
NEWIP=$(curl -s icanhazip.com)
eko "..$NEWIP.."
if [ "$CURIP" = "$NEWIP" ]; then
# Same IP after the data toggle: force a 3G->LTE network-mode
# bounce as a second attempt at getting a fresh address.
eko "ERROR!\n -> Old and new IPs are identical.\n"
eko " -> Trying 3G/LTE switch mitigation"
settings put global preferred_network_mode1 0 # Switch to 3G
eko "..3G."
# Wait until we get inet
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
sleep 1 # Just to prevent issues lol
settings put global preferred_network_mode1 9 # Switch to 4G/LTE
eko ".LTE.."
# Wait until we get inet /TODO : Make this modular
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
eko "online"
sleep 2
CURIP=$(curl -s icanhazip.com)
eko "..$CURIP"
if [ "$CURIP" = "$NEWIP" ]; then
# Both mitigations failed: print resume instructions and bail.
eko "ERROR!\n -> Old and new IPs are *still* identical.\n"
eko " -> Stopped at PAGE $PAGE from $FIRST on [$QUERY]\n"
export PAGE=$PAGE
export FIRST=$FIRST
eko " -> Please modify file $0 and edit the values of PAGE and FIRST\n"
eko " -> Please modify file $QUERYFILE and remove all lines before current query ($QUERY)\n"
eko "Quitting program..\n"
Finish_Cleanup
exit 1
else
eko " done!\n"
eko " -> Mitigation successful\n"
continue
fi
else
eko " done!\n"
fi
# IP changed: retry the same page fetch.
eko " -> Re-running current action..\n"
continue
elif ! [ $STATUS -eq 200 ]; then
eko "..CODE $STATUS! Terminating\n"
exit 1
else
eko "..200"
fi
# awk only extracts domain portion of it i.e. areaz.egloos.com from http://areaz.egloos.com/1234/
pup "ol#b_results > li.b_algo > h2 > a attr{href}" < ${OUTPUT}/BINGHTML/Q${QUERYNUM}_P${PAGE}.htm | awk -F[/:] '{print $4}' >> $OUTPUT/$OUTFILE
NewlyAdded=$(( $(wc -l < $OUTPUT/$OUTFILE) - $SavedLines ))
eko "..Added $NewlyAdded\n"
((PAGE++))
# Advance the result offset by however many links this page yielded.
FIRST=$(($FIRST + $NewlyAdded))
sed -i 's/ › .*//' $OUTPUT/$OUTFILE
sleep $DELAY
done
eko "[$QUERY] Query #$QUERYNUM Finished\n\n"
fi
# Reset per-query state before moving on to the next line.
NewlyAdded=999
((QUERYNUM++))
FIRST=1
PAGE=1
done
# done < "$QUERYFILE"
eko "\n -> Finished fetching..\n"
Finish_Cleanup
# --- Google scraper configuration (second stage; overrides the Bing vars) ---
# Completely free to modify or whatever. Made by hexagonwin <[email protected]>
# This thing is FREE SOFTWARE or whatever you choose to call it
# NOTE(review): FIRST/PAGE/QUERYNUM are hard-coded mid-run resume values;
# the trailing "#0 / #1 / #1" comments record the fresh-start defaults.
FIRST=150 #0
PAGE=16 #1
QUERYNUM=22 #1
UA="Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25" # mobile UA — presumably to get Google's simpler mobile HTML; verify against the pup selector below
DELAY="1" # seconds between result-page fetches
OUTPUT=./out # same working directory as the Bing stage
VER=1.2-and # version string ("-and" presumably = Android build)
LOGFILE=$OUTPUT/google_log # eko() output appended here from now on
COOKIE=$OUTPUT/google_cookie.txt # separate cookie jar for Google
OUTFILE=urls # shared with the Bing stage: results accumulate in one file
NewlyAdded=999 # sentinel: "no page fetched yet" for the paging loop
QUERYFILE=query # same query list as the Bing stage
# eko MESSAGE
# Redefined for the Google stage (picks up the new $LOGFILE): print
# MESSAGE to stdout with backslash escapes interpreted and no trailing
# newline, and append the same text to $LOGFILE. printf '%b' is the
# portable equivalent of the original `echo -ne`.
eko(){
printf '%b' "$1"
printf '%b' "$1" >> "$LOGFILE"
}
# Abort early when there is nothing to search for.
if ! [ -f "$QUERYFILE" ]; then
eko "Query file $QUERYFILE nonexistent. Quitting..\n"
exit 1
fi
# Create the output tree and make sure the result and log files exist.
mkdir -p "$OUTPUT" "$OUTPUT/GOOGLEHTML"
touch "$OUTPUT/$OUTFILE"
touch "$LOGFILE"
# Always start a run with a fresh cookie jar.
rm -f "$COOKIE"
eko "Init GDomainListRetrieve Agent $VER\n"
eko " -Log $LOGFILE, cookie $COOKIE, output to $OUTFILE on $OUTPUT, $(wc -l < "$QUERYFILE") queries\n"
#while read -r QUERY; do
# Main Google loop: same structure as the Bing loop above — walk
# $QUERYFILE by line number, paging each query until a page adds nothing.
while true; do
# Terminate once QUERYNUM runs past the last line of the query file;
# otherwise sed returns empty forever and the loop never exits, making
# the cleanup after it unreachable.
if [ "$QUERYNUM" -gt "$(wc -l < "$QUERYFILE")" ]; then
break
fi
QUERY_ORIG=$(sed -n "${QUERYNUM}p" < $QUERYFILE)
# Only run when $QUERY is not empty
if [ -z "$QUERY_ORIG" ]; then
eko "Skipping #$QUERYNUM; Query is empty\n\n"
else
# Restrict every query to egloos.com; "-pds" excludes pds hits.
QUERY="$QUERY_ORIG site:egloos.com -pds"
eko "[$QUERY] Query #$QUERYNUM Starting\n"
# Keep paging while the previous fetch contributed at least one URL.
while ! [ $NewlyAdded -eq 0 ]; do
# We count output lines, we get newly added ones via this
SavedLines=$(wc -l < $OUTPUT/$OUTFILE)
# We save saved html -> ./WEB/GOOGLEHTML/From*.htm
eko "[$QUERY] PAGE $PAGE (from $FIRST)"
# Google uses "start=" for the result offset (0-based); raw HTML is
# kept under GOOGLEHTML/ for later inspection.
STATUS=$(curl -s -w "%{http_code}" -b $COOKIE -c $COOKIE "https://www.google.com/search?q=$(echo $QUERY | jq '@uri' -jRr)&start=$FIRST" -A "$UA" -o ${OUTPUT}/GOOGLEHTML/Q${QUERYNUM}_P${PAGE}.htm)
if [ $STATUS -eq 302 ]; then
# A 302 is treated as a rate-limit / IP ban.
eko "..302!\n -> We got an IP ban."
rm $COOKIE
eko "\n -> Cleared cookie.."
eko "\n -> Resetting IP.."
CURIP=$(curl -s icanhazip.com)
eko "$CURIP"
# Toggle Android mobile data (svc) so the carrier assigns a new IP.
svc data disable
eko "..off."
sleep 2
svc data enable
eko ".on.."
# Wait until we get inet
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
eko "online"
sleep 2
NEWIP=$(curl -s icanhazip.com)
eko "..$NEWIP.."
if [ "$CURIP" = "$NEWIP" ]; then
# Same IP after the data toggle: force a 3G->LTE network-mode
# bounce as a second attempt at getting a fresh address.
eko "ERROR!\n -> Old and new IPs are identical.\n"
eko " -> Trying 3G/LTE switch mitigation"
settings put global preferred_network_mode1 0 # Switch to 3G
eko "..3G."
# Wait until we get inet
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
sleep 1 # Just to prevent issues lol
settings put global preferred_network_mode1 9 # Switch to 4G/LTE
eko ".LTE.."
# Wait until we get inet /TODO : Make this modular
while true; do
if [ $(ip add sh dev rmnet0 | grep inet | wc -l) -ne 0 ]; then
break
fi
sleep 1
done
eko "online"
sleep 2
CURIP=$(curl -s icanhazip.com)
eko "..$CURIP"
if [ "$CURIP" = "$NEWIP" ]; then
# Both mitigations failed: print resume instructions and bail.
# Run Finish_Cleanup first, matching the Bing stage's abort path.
eko "ERROR!\n -> Old and new IPs are *still* identical.\n"
eko " -> Stopped at PAGE $PAGE from $FIRST on [$QUERY]\n"
export PAGE=$PAGE
export FIRST=$FIRST
eko " -> Please modify file $0 and edit the values of PAGE and FIRST\n"
eko " -> Please modify file $QUERYFILE and remove all lines before current query ($QUERY)\n"
eko "Quitting program..\n"
Finish_Cleanup
exit 1
else
eko " done!\n"
eko " -> Mitigation successful\n"
continue
fi
else
eko " done!\n"
fi
# IP changed: retry the same page fetch.
eko " -> Re-running current action..\n"
continue
elif ! [ $STATUS -eq 200 ]; then
eko "..CODE $STATUS! Terminating\n"
exit 1
else
eko "..200"
fi
# pup pulls the displayed domain text straight out of Google's mobile
# result markup (no awk step needed, unlike the Bing stage).
pup "body > div:nth-child(3) > div > div > div > div > div > a > span:nth-child(2) > span text{}" < ${OUTPUT}/GOOGLEHTML/Q${QUERYNUM}_P${PAGE}.htm >> $OUTPUT/$OUTFILE
NewlyAdded=$(( $(wc -l < $OUTPUT/$OUTFILE) - $SavedLines ))
eko "..Added $NewlyAdded\n"
((PAGE++))
# Advance the result offset by however many links this page yielded.
FIRST=$(($FIRST + $NewlyAdded))
sed -i 's/ › .*//' $OUTPUT/$OUTFILE
sleep $DELAY
done
NewlyAdded=999
eko "[$QUERY] Query #$QUERYNUM Finished\n\n"
fi
# Reset per-query state before moving on to the next line.
((QUERYNUM++))
FIRST=0
PAGE=1
done
# done < "$QUERYFILE"
eko "\n -> Finished fetching..\n"
# Reuse Finish_Cleanup (defined alongside the Bing stage above) instead of
# duplicating the same sed/sort/uniq sequence inline; it prints the
# identical progress messages and final item count.
Finish_Cleanup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment