Apache bot control system: filter out good and bad crawlers/webspiders when they hit your server hard, like Googlebot and Bingbot. Block all of them from the places marked in robots.txt as off-limits (which they sometimes visit anyway).
# To relieve servers
## Imagine a robots.txt file like this (Google understands this format):
#User-agent: *
#Disallow: /detailed
#Disallow: /?action=detailed
#Disallow: /*/detailed
#Crawl-delay: 20
##
# To enable these rules, save them to httpd.conf (Debian/Ubuntu) and include the following 2 lines in each VirtualHost directive:
# RewriteEngine On
# RewriteOptions Inherit
# Then this will work in your virtual hosts as well as in the main server, except for the ones you don't set up; see the sketch below.
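# For illustration, a minimal VirtualHost carrying those two lines might look like this
# (example.com and the DocumentRoot path are placeholders, adjust to your setup):
#<VirtualHost *:80>
#    ServerName example.com
#    DocumentRoot /var/www/example
#    # Inherit the bot-blocking rewrite rules defined below in the main server config
#    RewriteEngine On
#    RewriteOptions Inherit
#</VirtualHost>
##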
# And if you want to enforce those policies, you can do this:
# Put the rules below in your httpd.conf file
RewriteEngine On
# Serve a general robots.txt file (for all virtual hosts here) from a file Apache can access
RewriteRule ^/robots\.txt$ /etc/apache2/robots.txt [L]
# robots.txt is now the only file that any of the bots blocked below will be allowed to download
# Block fake Googlebot when it's not coming from Google's IP ranges (a fake googlebot) [F] => Forbidden
RewriteCond %{HTTP:X-FORWARDED-FOR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC]
RewriteRule .* - [F,L]
# End if match
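# Note: the Googlebot check above relies on the X-Forwarded-For header, which is only
# trustworthy behind a reverse proxy that sets it. If Apache faces clients directly,
# an untested sketch of the same idea against the real client address would be:
#   RewriteCond %{REMOTE_ADDR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
#   RewriteCond %{HTTP_USER_AGENT} Googlebot [NC]
#   RewriteRule .* - [F,L]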
# IF THE UA STARTS WITH THESE
RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mister.?pix|moget|mozilla.?newt|nameprotect|navroad|backdoorbot|nearsite) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(net.?vampire|netants|netcraft|netmechanic|netspider|nextgensearchbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(attach|nicerspro|nimblecrawler|npbot|octopus|offline.?explorer) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(offline.?navigator|openfind|outfoxbot|pagegrabber|papa|pavuk) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(pcbrowser|php.?version.?tracker|pockey|propowerbot|prowebwalker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(psbot|pump|queryn|recorder|realdownload|reaper|reget|true_robot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(repomonkey|rma|internetseer|sitesnagger|siphon|slysearch|smartdownload) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(snake|snapbot|snoopy|sogou|spacebison|spankbot|spanner|sqworm|superbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|takeout|teleport) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC]
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule . - [F,L]
# End if match
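# The [F] rules here serve Apache's default 403 ErrorDocument. If you'd rather hand
# blocked bots a cheap static page, a sketch (the path /errors/403.html is an assumption,
# it must exist under your DocumentRoot):
#   ErrorDocument 403 /errors/403.html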
# Block real engines that don't respect robots.txt, while allowing correct calls to pass (basically all detail searches are blocked)
# It seems to take about 2 days of 403s before they respect the robots.txt file, even though that file got downloaded several times
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
# Bing
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
# msnbot
RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
# Slurp
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC]
# Block all detail searches, the rest may pass (things like /detailed, /EN/detailed/ and ?action=detailed)
RewriteCond %{REQUEST_URI} ^(/detailed|/[A-Z]{2}/detailed/) [OR]
# or with the action=detailed key set
RewriteCond %{QUERY_STRING} action=detailed
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule .* - [F,L]
# End if match
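# Optional sketch if you want to see what gets blocked: tag matches with an environment
# variable and log them to a separate file (the variable name and log path are assumptions):
#   RewriteRule .* - [F,E=blocked_bot:1,L]
#   CustomLog /var/log/apache2/blocked_bots.log combined env=blocked_bot
# Depending on how the 403 ErrorDocument is handled internally, the variable may show up
# as REDIRECT_blocked_bot in the logging phase, so check both names.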
# Definite blocks
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]
# Baiduspider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]
# Deepspider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]
# Known user agent strings definitely belonging to bad spiders
RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
# Yandex (Russian search engine)
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ YandexBot/3\.[01];\ \+http://yandex\.com/bots\)$ [NC,OR]
# Pingdom
RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.pingdom\.com/\) [NC,OR]
# AhrefsBot
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]
# Block a rogue Facebook application?
#RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
#
# Vagabondo
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]
# Ezooms
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]
# Monalisa
RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact\ at\ synthesio\ dot\ fr\)$ [NC,OR]
# ANYWHERE IN UA -- GREEDY REGEX
RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule . - [F,L]
# End if match
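# Quick sanity check (shell command, not part of this config): requesting a page with a
# blocked user agent should return 403, e.g. with a placeholder hostname:
#   curl -I -A "webzip" http://yourserver/
# Note that curl's own default user agent also starts with "curl", which is in the
# starts-with list above, so a plain curl request without -A gets blocked too.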