Last active
August 15, 2023 15:53
-
-
Save HexagonWin/64dc365a25a523eb154560b82e6fa524 to your computer and use it in GitHub Desktop.
Scrape egloos URLs from search engines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------------------------------------------------------------
# Bing scraper configuration
# ---------------------------------------------------------------------------
# Completely free to modify or whatever. Made by hexagonwin <[email protected]>
# This thing is FREE SOFTWARE or whatever you choose to call it

# Resume point. QUERYNUM is the 1-based line of $QUERYFILE to start from,
# FIRST is the result offset and PAGE the page counter used in log messages
# and saved file names. Each may be overridden from the environment, e.g.
#   QUERYNUM=5 PAGE=3 FIRST=21 ./script
# so that an interrupted run can be resumed without editing this file.
QUERYNUM=${QUERYNUM:-1}
# Bing starts at FIRST=1, Google starts at FIRST=0
FIRST=${FIRST:-1}
PAGE=${PAGE:-1}

# User-Agent header sent with every search request.
UA="Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
# Seconds to sleep between result pages.
DELAY="2"
# Directory that receives the log, the cookie jar, the saved HTML pages and
# the collected list.
OUTPUT=./out
VER=1.2-and
LOGFILE=$OUTPUT/bing_log
COOKIE=$OUTPUT/bing_cookie.txt
# Name (inside $OUTPUT) of the file the extracted entries are appended to.
OUTFILE=urls
# Any non-zero value; the per-query paging loop stops once a page adds 0.
NewlyAdded=999
#QUERY="site:*.egloos.com"
# File with one search query per line.
QUERYFILE=query
#######################################
# Print a message to stdout and append the same text to the log file.
# Backslash escapes in the message (\n, \t, ...) are interpreted and no
# trailing newline is added, so callers control line breaks themselves.
# Globals:   LOGFILE (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   the message on stdout and in $LOGFILE
#######################################
eko() {
  # printf '%b' expands escapes like `echo -e`, but never mistakes a message
  # such as "-n" for an option.
  printf '%b' "${1-}"
  printf '%b' "${1-}" >> "$LOGFILE"
}
#######################################
# Tidy the collected list: strip everything from the first " › " on each
# line, then sort the file and drop duplicate lines, all in place.
# Globals:   OUTPUT, OUTFILE (read) - the list lives at $OUTPUT/$OUTFILE
# Outputs:   progress messages and the final line count, via eko
# Returns:   non-zero if sed or sort fails
#######################################
Finish_Cleanup() {
  local list="$OUTPUT/$OUTFILE"
  local count

  eko "Cleaning up.."
  sed -i 's/ › .*//' "$list" || return

  eko "Sorting.."
  eko "Unduplicating.."
  # sort -u sorts and de-duplicates in one pass; -o may safely name the input
  # file because sort reads all input before writing. LC_ALL=C compares raw
  # bytes so distinct lines are never merged by locale collation rules.
  LC_ALL=C sort -u -o "$list" -- "$list" || return

  eko "Finished!\n\n"
  count=$(wc -l < "$list")
  # Arithmetic expansion trims any padding some wc implementations emit.
  eko "We saved $((count)) items\n"
}
# Create the output tree and the log file BEFORE the first eko call: eko
# appends to $LOGFILE, which lives inside $OUTPUT, so logging would fail on a
# fresh checkout if the directory did not exist yet.
mkdir -p -- "$OUTPUT" "$OUTPUT/BINGHTML" || exit 1
touch -- "$OUTPUT/$OUTFILE" "$LOGFILE" || exit 1

# Nothing to do without a query list.
if ! [ -f "$QUERYFILE" ]; then
  eko "Query file $QUERYFILE nonexistent. Quitting..\n"
  exit 1
fi

# Start every run with an empty cookie jar.
rm -f -- "$COOKIE"

eko "Init BDomainListRetrieve Agent $VER\n"
eko " -Log $LOGFILE, cookie $COOKIE, output to $OUTFILE on $OUTPUT, $(wc -l < "$QUERYFILE") queries\n"
# ---------------------------------------------------------------------------
# Main fetch loop (Bing)
#
# Queries are read from $QUERYFILE by line number ($QUERYNUM) so that the
# starting line can be chosen via QUERYNUM. For every non-empty query the
# result pages are fetched one after another until a page adds no new lines
# to $OUTPUT/$OUTFILE. An HTTP 302 is treated as an IP ban and triggers a
# mobile-data reset; any other non-200 status aborts the script.
# ---------------------------------------------------------------------------

# Block until the mobile interface rmnet0 reports an inet address.
wait_for_inet() {
  until ip add sh dev rmnet0 | grep -q inet; do
    sleep 1
  done
}

# Handle an HTTP 302: clear the cookie jar and cycle mobile data to obtain a
# different public IP. If the IP is unchanged, switch the preferred network
# mode to 3G and back to LTE and compare again. Returns 0 when the IP changed
# (the caller then retries the same page); otherwise logs where the run
# stopped, tidies the collected list and exits the script with status 1.
# Reads COOKIE, PAGE, FIRST, QUERY and QUERYFILE.
recover_from_ip_ban() {
  local old_ip new_ip

  eko "..302!\n -> We got an IP ban."
  rm -f -- "$COOKIE"
  eko "\n -> Cleared cookie.."
  eko "\n -> Resetting IP.."
  old_ip=$(curl -s icanhazip.com)
  eko "$old_ip"
  svc data disable
  eko "..off."
  sleep 2
  svc data enable
  eko ".on.."
  wait_for_inet
  eko "online"
  sleep 2
  new_ip=$(curl -s icanhazip.com)
  eko "..$new_ip.."
  if [ "$old_ip" != "$new_ip" ]; then
    eko " done!\n"
    eko " -> Re-running current action..\n"
    return 0
  fi

  eko "ERROR!\n -> Old and new IPs are identical.\n"
  eko " -> Trying 3G/LTE switch mitigation"
  settings put global preferred_network_mode1 0 # Switch to 3G
  eko "..3G."
  wait_for_inet
  sleep 1 # Just to prevent issues
  settings put global preferred_network_mode1 9 # Switch to 4G/LTE
  eko ".LTE.."
  wait_for_inet
  eko "online"
  sleep 2
  new_ip=$(curl -s icanhazip.com)
  eko "..$new_ip"
  if [ "$old_ip" != "$new_ip" ]; then
    eko " done!\n"
    eko " -> Mitigation successful\n"
    return 0
  fi

  eko "ERROR!\n -> Old and new IPs are *still* identical.\n"
  eko " -> Stopped at PAGE $PAGE from $FIRST on [$QUERY]\n"
  eko " -> Please modify file $0 and edit the values of PAGE and FIRST\n"
  eko " -> Please modify file $QUERYFILE and remove all lines before current query ($QUERY)\n"
  eko "Quitting program..\n"
  Finish_Cleanup
  exit 1
}

# Number of lines in the query file (awk's NR also counts a final line that
# lacks a trailing newline). Bounding the loop by it lets the script reach the
# final clean-up instead of skipping "empty" queries forever past the end.
TotalQueries=$(awk 'END { print NR }' "$QUERYFILE")

while [ "$QUERYNUM" -le "${TotalQueries:-0}" ]; do
  QUERY_ORIG=$(sed -n "${QUERYNUM}p" "$QUERYFILE")
  # Only run when the query line is not empty
  if [ -z "$QUERY_ORIG" ]; then
    eko "Skipping #$QUERYNUM; Query is empty\n\n"
  else
    QUERY="$QUERY_ORIG site:egloos.com -pds"
    eko "[$QUERY] Query #$QUERYNUM Starting\n"
    # Any non-zero value so the paging loop runs at least once per query.
    NewlyAdded=999
    while [ "$NewlyAdded" -ne 0 ]; do
      # Line count before this page; the difference afterwards is what the
      # page contributed.
      SavedLines=$(wc -l < "$OUTPUT/$OUTFILE")
      # Each fetched page is kept as $OUTPUT/BINGHTML/Q<query>_P<page>.htm
      PageFile="$OUTPUT/BINGHTML/Q${QUERYNUM}_P${PAGE}.htm"
      eko "[$QUERY] PAGE $PAGE (from $FIRST)"
      # Quote the query so characters such as '*' are not glob-expanded
      # before being percent-encoded by jq.
      EncodedQuery=$(printf '%s' "$QUERY" | jq -jRr '@uri')
      STATUS=$(curl -s -w "%{http_code}" -b "$COOKIE" -c "$COOKIE" -A "$UA" \
        -o "$PageFile" \
        "https://www.bing.com/search?q=${EncodedQuery}&first=$FIRST")
      case "$STATUS" in
        302)
          recover_from_ip_ban
          # Retry the same page with the new address.
          continue
          ;;
        200)
          eko "..200"
          ;;
        *)
          eko "..CODE $STATUS! Terminating\n"
          exit 1
          ;;
      esac
      # awk only extracts the domain portion, i.e. areaz.egloos.com from
      # http://areaz.egloos.com/1234/
      pup "ol#b_results > li.b_algo > h2 > a attr{href}" < "$PageFile" \
        | awk -F'[/:]' '{print $4}' >> "$OUTPUT/$OUTFILE"
      NewlyAdded=$(($(wc -l < "$OUTPUT/$OUTFILE") - SavedLines))
      eko "..Added $NewlyAdded\n"
      PAGE=$((PAGE + 1))
      FIRST=$((FIRST + NewlyAdded))
      sed -i 's/ › .*//' "$OUTPUT/$OUTFILE"
      sleep "$DELAY"
    done
    eko "[$QUERY] Query #$QUERYNUM Finished\n\n"
  fi
  # Next query starts again from the first Bing result page.
  NewlyAdded=999
  QUERYNUM=$((QUERYNUM + 1))
  FIRST=1
  PAGE=1
done

eko "\n -> Finished fetching..\n"
Finish_Cleanup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------------------------------------------------------------
# Google scraper configuration
# ---------------------------------------------------------------------------
# Completely free to modify or whatever. Made by hexagonwin <[email protected]>
# This thing is FREE SOFTWARE or whatever you choose to call it

# Resume point. FIRST is the result offset passed as `start=`, PAGE the page
# counter used in log messages and saved file names, QUERYNUM the 1-based line
# of $QUERYFILE to start from. The defaults below are the values left in the
# file; the trailing comments show the values for a run from the beginning.
# Each may be overridden from the environment, e.g.
#   QUERYNUM=1 PAGE=1 FIRST=0 ./script
FIRST=${FIRST:-150}      #0
PAGE=${PAGE:-16}         #1
QUERYNUM=${QUERYNUM:-22} #1

# User-Agent header sent with every search request.
UA="Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
# Seconds to sleep between result pages.
DELAY="1"
# Directory that receives the log, the cookie jar, the saved HTML pages and
# the collected list.
OUTPUT=./out
VER=1.2-and
LOGFILE=$OUTPUT/google_log
COOKIE=$OUTPUT/google_cookie.txt
# Name (inside $OUTPUT) of the file the extracted entries are appended to.
OUTFILE=urls
# Any non-zero value; the per-query paging loop stops once a page adds 0.
NewlyAdded=999
# File with one search query per line.
QUERYFILE=query
#######################################
# Print a message to stdout and append the same text to the log file.
# Backslash escapes in the message (\n, \t, ...) are interpreted and no
# trailing newline is added, so callers control line breaks themselves.
# Globals:   LOGFILE (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   the message on stdout and in $LOGFILE
#######################################
eko() {
  # printf '%b' expands escapes like `echo -e`, but never mistakes a message
  # such as "-n" for an option.
  printf '%b' "${1-}"
  printf '%b' "${1-}" >> "$LOGFILE"
}
# Create the output tree and the log file BEFORE the first eko call: eko
# appends to $LOGFILE, which lives inside $OUTPUT, so logging would fail on a
# fresh checkout if the directory did not exist yet.
mkdir -p -- "$OUTPUT" "$OUTPUT/GOOGLEHTML" || exit 1
touch -- "$OUTPUT/$OUTFILE" "$LOGFILE" || exit 1

# Nothing to do without a query list.
if ! [ -f "$QUERYFILE" ]; then
  eko "Query file $QUERYFILE nonexistent. Quitting..\n"
  exit 1
fi

# Start every run with an empty cookie jar.
rm -f -- "$COOKIE"

eko "Init GDomainListRetrieve Agent $VER\n"
eko " -Log $LOGFILE, cookie $COOKIE, output to $OUTFILE on $OUTPUT, $(wc -l < "$QUERYFILE") queries\n"
# ---------------------------------------------------------------------------
# Main fetch loop (Google)
#
# Queries are read from $QUERYFILE by line number ($QUERYNUM) so that the
# starting line can be chosen via QUERYNUM. For every non-empty query the
# result pages are fetched one after another until a page adds no new lines
# to $OUTPUT/$OUTFILE. An HTTP 302 is treated as an IP ban and triggers a
# mobile-data reset; any other non-200 status aborts the script.
# ---------------------------------------------------------------------------

# Tidy the collected list in place: strip everything from the first " › " on
# each line, then sort and drop duplicate lines, and report the final count.
Finish_Cleanup() {
  local list="$OUTPUT/$OUTFILE"
  local count

  eko "Cleaning up.."
  sed -i 's/ › .*//' "$list" || return

  eko "Sorting.."
  eko "Unduplicating.."
  # sort -u sorts and de-duplicates in one pass; -o may safely name the input
  # file because sort reads all input before writing. LC_ALL=C compares raw
  # bytes so distinct lines are never merged by locale collation rules.
  LC_ALL=C sort -u -o "$list" -- "$list" || return

  eko "Finished!\n\n"
  count=$(wc -l < "$list")
  # Arithmetic expansion trims any padding some wc implementations emit.
  eko "We saved $((count)) items\n"
}

# Block until the mobile interface rmnet0 reports an inet address.
wait_for_inet() {
  until ip add sh dev rmnet0 | grep -q inet; do
    sleep 1
  done
}

# Handle an HTTP 302: clear the cookie jar and cycle mobile data to obtain a
# different public IP. If the IP is unchanged, switch the preferred network
# mode to 3G and back to LTE and compare again. Returns 0 when the IP changed
# (the caller then retries the same page); otherwise logs where the run
# stopped, tidies the collected list and exits the script with status 1.
# Reads COOKIE, PAGE, FIRST, QUERY and QUERYFILE.
recover_from_ip_ban() {
  local old_ip new_ip

  eko "..302!\n -> We got an IP ban."
  rm -f -- "$COOKIE"
  eko "\n -> Cleared cookie.."
  eko "\n -> Resetting IP.."
  old_ip=$(curl -s icanhazip.com)
  eko "$old_ip"
  svc data disable
  eko "..off."
  sleep 2
  svc data enable
  eko ".on.."
  wait_for_inet
  eko "online"
  sleep 2
  new_ip=$(curl -s icanhazip.com)
  eko "..$new_ip.."
  if [ "$old_ip" != "$new_ip" ]; then
    eko " done!\n"
    eko " -> Re-running current action..\n"
    return 0
  fi

  eko "ERROR!\n -> Old and new IPs are identical.\n"
  eko " -> Trying 3G/LTE switch mitigation"
  settings put global preferred_network_mode1 0 # Switch to 3G
  eko "..3G."
  wait_for_inet
  sleep 1 # Just to prevent issues
  settings put global preferred_network_mode1 9 # Switch to 4G/LTE
  eko ".LTE.."
  wait_for_inet
  eko "online"
  sleep 2
  new_ip=$(curl -s icanhazip.com)
  eko "..$new_ip"
  if [ "$old_ip" != "$new_ip" ]; then
    eko " done!\n"
    eko " -> Mitigation successful\n"
    return 0
  fi

  eko "ERROR!\n -> Old and new IPs are *still* identical.\n"
  eko " -> Stopped at PAGE $PAGE from $FIRST on [$QUERY]\n"
  eko " -> Please modify file $0 and edit the values of PAGE and FIRST\n"
  eko " -> Please modify file $QUERYFILE and remove all lines before current query ($QUERY)\n"
  eko "Quitting program..\n"
  # Leave a sorted, de-duplicated list behind even when aborting.
  Finish_Cleanup
  exit 1
}

# Number of lines in the query file (awk's NR also counts a final line that
# lacks a trailing newline). Bounding the loop by it lets the script reach the
# final clean-up instead of skipping "empty" queries forever past the end.
TotalQueries=$(awk 'END { print NR }' "$QUERYFILE")

while [ "$QUERYNUM" -le "${TotalQueries:-0}" ]; do
  QUERY_ORIG=$(sed -n "${QUERYNUM}p" "$QUERYFILE")
  # Only run when the query line is not empty
  if [ -z "$QUERY_ORIG" ]; then
    eko "Skipping #$QUERYNUM; Query is empty\n\n"
  else
    QUERY="$QUERY_ORIG site:egloos.com -pds"
    eko "[$QUERY] Query #$QUERYNUM Starting\n"
    # Any non-zero value so the paging loop runs at least once per query.
    NewlyAdded=999
    while [ "$NewlyAdded" -ne 0 ]; do
      # Line count before this page; the difference afterwards is what the
      # page contributed.
      SavedLines=$(wc -l < "$OUTPUT/$OUTFILE")
      # Each fetched page is kept as $OUTPUT/GOOGLEHTML/Q<query>_P<page>.htm
      PageFile="$OUTPUT/GOOGLEHTML/Q${QUERYNUM}_P${PAGE}.htm"
      eko "[$QUERY] PAGE $PAGE (from $FIRST)"
      # Quote the query so characters such as '*' are not glob-expanded
      # before being percent-encoded by jq.
      EncodedQuery=$(printf '%s' "$QUERY" | jq -jRr '@uri')
      STATUS=$(curl -s -w "%{http_code}" -b "$COOKIE" -c "$COOKIE" -A "$UA" \
        -o "$PageFile" \
        "https://www.google.com/search?q=${EncodedQuery}&start=$FIRST")
      case "$STATUS" in
        302)
          recover_from_ip_ban
          # Retry the same page with the new address.
          continue
          ;;
        200)
          eko "..200"
          ;;
        *)
          eko "..CODE $STATUS! Terminating\n"
          exit 1
          ;;
      esac
      # Append the text of every element matched by the selector; the
      # " › ..." tail of each line is stripped by the sed call below.
      pup "body > div:nth-child(3) > div > div > div > div > div > a > span:nth-child(2) > span text{}" \
        < "$PageFile" >> "$OUTPUT/$OUTFILE"
      NewlyAdded=$(($(wc -l < "$OUTPUT/$OUTFILE") - SavedLines))
      eko "..Added $NewlyAdded\n"
      PAGE=$((PAGE + 1))
      FIRST=$((FIRST + NewlyAdded))
      sed -i 's/ › .*//' "$OUTPUT/$OUTFILE"
      sleep "$DELAY"
    done
    eko "[$QUERY] Query #$QUERYNUM Finished\n\n"
  fi
  # Next query starts again from the first Google result page.
  NewlyAdded=999
  QUERYNUM=$((QUERYNUM + 1))
  FIRST=0
  PAGE=1
done

eko "\n -> Finished fetching..\n"
Finish_Cleanup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment