Created
April 1, 2013 21:15
-
-
Save adamloving/5287815 to your computer and use it in GitHub Desktop.
Search strings for common user agents to be filtered out of web page view statistics.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Console snippet: count events per user agent, then print "count,useragent" lines.
#   h = {}; Event.all.each { |e| h[e.useragent] = (h[e.useragent] || 0) + 1 }
#   h.keys.each { |k| puts "#{h[k]},#{k}" }
# Recognizes user agent strings that belong to bots (crawlers, link
# expanders, HTTP client libraries) so their requests can be filtered out
# of web page view statistics.
#
# Uses core Ruby only; no ActiveSupport helpers (present?/blank?) are needed.
class UserAgent
  # Table of known bots. Entries are tested in order and the first entry
  # whose :match substring occurs in the user agent wins.
  #
  #   :name  - label for the bot
  #   :match - case-sensitive substring searched for in the user agent
  #   :url   - reference URL for the bot ('' when none is recorded)
  #
  # gplus => "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0"
  BOTS = [
    { :name => 'AppEngine', :match => 'AppEngine', :url => 'http://code.google.com/appengine' },
    { :name => 'Embedly', :match => 'Embedly', :url => 'http://support.embed.ly/' },
    { :name => 'news.me', :match => 'news.me', :url => '' },
    { :name => 'Voyager', :match => 'Voyager/1.0', :url => '' },
    { :name => 'bitlybot', :match => 'bitlybot', :url => '' },
    { :name => 'InAGist', :match => 'InAGist', :url => 'http://inagist.com' },
    { :name => 'Crowsnest', :match => 'Crowsnest', :url => 'http://www.crowsnest.tv' },
    { :name => 'Apache-HttpClient', :match => 'Apache-HttpClient', :url => '' },
    { :name => 'RockMeltEmbedService', :match => 'RockMeltEmbedService', :url => '' },
    { :name => 'ShowyouBot', :match => 'ShowyouBot', :url => 'http://showyou.com/crawler' },
    { :name => 'Jakarta Commons', :match => 'Jakarta Commons', :url => '' },
    { :name => 'Java', :match => 'Java', :url => '' },
    { :name => 'CURL', :match => 'CURL', :url => '' },
    { :name => 'PycURL', :match => 'PycURL', :url => '' },
    { :name => 'percbotspider', :match => 'percbotspider', :url => '' },
    { :name => 'strawberryj.am', :match => 'strawberryj.am', :url => 'http://strawberryj.am' },
    { :name => 'Flipboard', :match => 'FlipboardProxy', :url => 'http://flipboard.com/browserproxy' },
    { :name => 'Google Feedfetcher', :match => 'Feedfetcher', :url => 'http://www.google.com/feedfetcher.html' },
    # Matches the Gecko build date from the gplus sample above; note that
    # is_gplus_bot? tests a longer substring than this entry does.
    { :name => 'Google Plus', :match => '20110814', :url => 'http://plus.google.com' },
    { :name => 'EventMachine', :match => 'EventMachine', :url => 'http://rubyeventmachine.com/' },
    { :name => 'Python urllib', :match => 'urllib', :url => 'http://docs.python.org/library/urllib.html' },
    { :name => 'Gnip', :match => 'gnip.com', :url => 'http://gnip.com' },
    { :name => 'Paper.li', :match => 'PaperLiBot', :url => 'http://paper.li' },
    { :name => 'TweetMeme', :match => 'TweetmemeBot', :url => 'http://tweetmeme.com' },
    { :name => 'Facebook', :match => 'facebookexternalhit', :url => 'http://www.facebook.com' },
    { :name => 'LinkedIn', :match => 'LinkedInBot', :url => 'http://linkedin.com' },
    { :name => 'Topsy', :match => 'Butterfly', :url => 'http://topsy.com' },
    { :name => 'Summify', :match => 'Summify', :url => 'http://summify.com' },
    { :name => 'Twitterbot', :match => 'Twitterbot', :url => 'http://twitter.com' },
    { :name => 'Echo', :match => 'JS-Kit', :url => 'http://aboutecho.com' },
    { :name => 'Yandex', :match => 'YandexBot', :url => 'http://yandex.com' },
    { :name => 'MetaURI', :match => 'MetaURI', :url => 'http://metauri.com' },
    # Never reached by is_bot?: the earlier 'bitlybot' entry already matches
    # every string that contains 'bitlybot ' (with the trailing space).
    { :name => 'Bitly', :match => 'bitlybot ', :url => 'http://bitly.com' },
    { :name => 'TweetedTimes', :match => 'TweetedTimes', :url => 'http://tweetedtimes.com' },
    { :name => 'Google', :match => 'Googlebot', :url => 'http://google.com' },
    { :name => 'Evri', :match => 'Evrinid', :url => 'http://evri.com' },
    { :name => 'LongURL', :match => 'LongURL', :url => 'http://longurl.org' },
    { :name => 'Topixtream', :match => 'Castabot', :url => 'http://topixtream.com' },
    { :name => 'urlresolver', :match => 'urlresolver', :url => 'http://ant.apache.org/ivy/history/latest-milestone/resolver/url.html' },
    { :name => 'cURL', :match => 'curl', :url => 'http://curl.haxx.se' },
    { :name => 'NineConnections.com', :match => 'Kimengi', :url => 'http://NineConnections.com' },
    { :name => 'Status.net (Laconica)', :match => 'Laconica', :url => 'http://status.net/' },
    { :name => 'Showyou.com', :match => 'Showyoubot', :url => 'http://showyou.com' },
    { :name => 'PostPost', :match => 'PostPost', :url => 'http://postpost.com' },
    { :name => 'OGS Critter', :match => 'OGS Critter', :url => 'http://google.com/?OGS+Critter' },
    { :name => 'postrank', :match => 'PostRank', :url => 'http://postrank.com' },
    { :name => 'percbotspider', :match => 'percobotspider', :url => 'http://google.com/?percobotspider' },
    { :name => 'Equentia', :match => 'EQUENTIA-BOT', :url => 'http://eqentia.com' },
    { :name => 'yolinkBot', :match => 'yolinkBot', :url => '' },
    { :name => 'NING', :match => 'NING', :url => '' },
    { :name => 'Instapaper', :match => 'instapaper', :url => '' },
    { :name => 'Blekkobot', :match => 'blekko', :url => 'http://blekko.com/about/blekkobot' },
    { :name => 'Baiduspider', :match => 'Baiduspider', :url => 'http://www.baidu.com/search/spider.html' },
    { :name => 'Ruby', :match => 'Ruby', :url => '' },
    { :name => 'Diffbot', :match => 'diffbot', :url => 'http://diffbot.com' },
    { :name => 'FriendFeedBot', :match => 'friendfeed', :url => 'http://friendfeed.com/about/bot' },
    { :name => 'Vancouver', :match => 'Vancouver', :url => '' },
    { :name => 'ScribdReader', :match => 'ScribdReader', :url => 'http://www.float.com' },
    { :name => 'HiveAnalyzer', :match => 'HiveAnalyzer', :url => 'http://www.businessinsider.com' },
    { :name => 'Yeti', :match => 'Yeti', :url => 'http://help.naver.com/robots/' },
    { :name => 'MFE_expand', :match => 'MFE_expand', :url => '' },
    { :name => 'intigi', :match => 'intigi', :url => 'http://intigi.com' },
    { :name => 'Trove', :match => 'Trove', :url => '' },
    { :name => 'EQENTIA-BOT', :match => 'EQENTIA-BOT', :url => '' },
    { :name => 'JS-Kit', :match => 'js-kit', :url => 'http://js-kit.com/' },
    { :name => 'spider', :match => 'spider', :url => '' },
    { :name => 'bingbot', :match => 'bingbot', :url => 'http://www.bing.com/bingbot.htm' },
    { :name => 'coccoc', :match => 'coccoc', :url => '' }
  ].freeze

  # Looks the user agent up in BOTS.
  #
  # @param useragent [#to_s, nil] raw User-Agent value; coerced with to_s so
  #   nil and non-String values are handled like in the other predicates
  # @return [Hash, false, nil] the first matching BOTS entry (truthy),
  #   false when nothing matches, or nil when the value is blank
  def self.is_bot?(useragent)
    ua = useragent.to_s
    # Blank means empty or whitespace only: no non-space character present.
    return if ua !~ /[^[:space:]]/

    BOTS.each do |bot|
      return bot if ua.include?(bot[:match])
    end
    false
  end

  # @param useragent [#to_s, nil] raw User-Agent value
  # @return [Boolean] true when the value contains 'facebookexternalhit'
  def self.is_facebook_bot?(useragent)
    useragent.to_s.include?('facebookexternalhit')
  end

  # @param useragent [#to_s, nil] raw User-Agent value
  # @return [Boolean] true when the value contains 'LinkedInBot'
  def self.is_linkedin_bot?(useragent)
    useragent.to_s.include?('LinkedInBot')
  end

  # @param useragent [#to_s, nil] raw User-Agent value
  # @return [Boolean] true when the value contains the Gecko/Firefox token
  #   pair shown in the gplus sample user agent at the top of the class
  def self.is_gplus_bot?(useragent)
    useragent.to_s.include?('Gecko/20110814 Firefox/6.0')
  end

  # @param useragent [#to_s, nil] raw User-Agent value
  # @return [Boolean] true when the value is recognized by the Facebook,
  #   LinkedIn or Google Plus predicate
  def self.needs_cloaking?(useragent)
    is_facebook_bot?(useragent) || is_linkedin_bot?(useragent) || is_gplus_bot?(useragent)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment