Created
January 15, 2012 19:12
-
-
Save r/1616875 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the 'host' command in the background and give up on it after 30 seconds.
#
# args:
# - $1: parameter to pass to 'host' (an IP address or a hostname)
# - $2: file to send the output of 'host' to
#
# returns:
# 1 => lookup timed out (the 'host' process was killed)
# 0 => 'host' exited on its own within the time limit; its own exit status
#      is not propagated, so callers must inspect the output file
function timeout_host() {
    local timeout=30
    local pid

    # Quote both arguments so that an empty value or one containing
    # whitespace or glob characters is passed through unchanged.
    host "$1" > "$2" &
    pid=$!

    # Poll once per second until the lookup exits or the budget is spent.
    # 'kill -0' sends no signal; it only reports whether the PID can still
    # be signalled.
    while [ "$timeout" -gt 0 ] && kill -0 "$pid" > /dev/null 2>&1; do
        sleep 1
        timeout=$((timeout - 1))
    done

    # Decide from the actual state of the process rather than from the
    # counter: a lookup that finished during the final sleep is a success,
    # not a timeout.
    if kill -0 "$pid" > /dev/null 2>&1; then
        kill -9 "$pid" > /dev/null 2>&1
        # Reap the killed child; stderr is discarded to hide the shell's
        # "Killed" job notice.
        wait "$pid" 2> /dev/null
        return 1
    fi

    return 0
}
# Given an IP address, determine whether or not it belongs to a googlebot host.
#
# The check is a forward-confirmed reverse DNS lookup:
#   1. reverse-resolve the IP and look for a PTR name under googlebot.com
#   2. forward-resolve that name and require that it maps back to the same IP
#
# args:
# - $1: ip address to evaluate
#
# returns:
# 0 => host is a googlebot host
# 1 => host is not a googlebot host (also returned when no IP was given or
#      the temporary file could not be created, since nothing was verified)
# 2 => the reverse lookup ('host' on the IP) timed out
# 3 => the forward lookup ('host' on the PTR name) timed out
function is_googlebot_ip() {
    local ip=$1
    local result=1
    local temp_filename
    local candidate_hostname

    [ -n "$ip" ] || return 1

    # Assign separately from 'local' so that a mktemp failure is not masked
    # by the exit status of 'local' itself.
    temp_filename=$(mktemp) || return 1

    if ! timeout_host "$ip" "$temp_filename"; then
        result=2
    else
        # Print only the PTR targets ('-n' + 'p'), keep those that END in
        # .googlebot.com (an optional trailing root dot is allowed), and use
        # the first one. Anchoring matters: PTR records are controlled by the
        # owner of the IP, so a substring match would accept a name such as
        # googlebot.example.net.
        candidate_hostname=$(sed -n 's/.*pointer[ ]*\(.*\)/\1/p' "$temp_filename" \
            | grep -i -E '\.googlebot\.com\.?$' \
            | head -n 1)

        if [ -n "$candidate_hostname" ]; then
            if ! timeout_host "$candidate_hostname" "$temp_filename"; then
                result=3
            # A name may resolve to several addresses; accept if any of them
            # is exactly the IP under test (-x whole line, -F literal string).
            elif sed -n 's/.*address[ ]*\(.*\)/\1/p' "$temp_filename" \
                | grep -q -x -F -- "$ip"; then
                result=0
            fi
        fi
    fi

    # Single exit path so the temporary file is removed on every outcome,
    # including the timeout cases.
    rm -f "$temp_filename"
    return "$result"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks for the comments, phil! i'll make some modifications.
in reality, the biggest problem is that doing this type of lookup, en masse, is really slow because the 'host' lookup takes way too long. i'll post a new gist of the java version i hacked together, using InetAddress, and running in $n$ threads (where $n$ is usually > 100) simultaneously.