Skip to content

Instantly share code, notes, and snippets.

@adamloving
Created April 1, 2013 21:15
Show Gist options
  • Save adamloving/5287815 to your computer and use it in GitHub Desktop.
Save adamloving/5287815 to your computer and use it in GitHub Desktop.
Search strings for common user agents to be filtered out of web page view statistics.
# h = {}; Event.all.each { |e| h[e.useragent] = (h[e.useragent] || 0) + 1; }
# h.keys.each { |k| puts "#{h[k]},#{k}" }
# Substring-based detection of automated user agents (crawlers, link
# expanders, HTTP libraries) so their requests can be filtered out of web
# page view statistics.
#
# All matching is case-sensitive substring search against the raw
# User-Agent string. Only the Ruby core library is required.
class UserAgent
  # gplus => "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0"

  # Known automated agents. Each entry has:
  #   :name  - label for the agent
  #   :match - case-sensitive substring looked for in the User-Agent
  #   :url   - reference URL for the agent ('' when none is recorded)
  #
  # Order matters: is_bot? returns the FIRST entry whose :match occurs in the
  # user agent, so specific tokens (e.g. 'Baiduspider') must stay ahead of
  # generic ones (e.g. 'spider'). For the same reason the later 'bitlybot '
  # entry is shadowed by the earlier 'bitlybot' one.
  #
  # The array and its hashes are frozen so the shared table cannot be
  # modified by accident at runtime.
  BOTS = [
    { :name => 'AppEngine', :match => 'AppEngine', :url => 'http://code.google.com/appengine' },
    { :name => 'Embedly', :match => 'Embedly', :url => 'http://support.embed.ly/' },
    { :name => 'news.me', :match => 'news.me', :url => '' },
    { :name => 'Voyager', :match => 'Voyager/1.0', :url => '' },
    { :name => 'bitlybot', :match => 'bitlybot', :url => '' },
    { :name => 'InAGist', :match => 'InAGist', :url => 'http://inagist.com' },
    { :name => 'Crowsnest', :match => 'Crowsnest', :url => 'http://www.crowsnest.tv' },
    { :name => 'Apache-HttpClient', :match => 'Apache-HttpClient', :url => '' },
    { :name => 'RockMeltEmbedService', :match => 'RockMeltEmbedService', :url => '' },
    { :name => 'ShowyouBot', :match => 'ShowyouBot', :url => 'http://showyou.com/crawler' },
    { :name => 'Jakarta Commons', :match => 'Jakarta Commons', :url => '' },
    { :name => 'Java', :match => 'Java', :url => '' },
    { :name => 'CURL', :match => 'CURL', :url => '' },
    { :name => 'PycURL', :match => 'PycURL', :url => '' },
    { :name => 'percbotspider', :match => 'percbotspider', :url => '' },
    { :name => 'strawberryj.am', :match => 'strawberryj.am', :url => 'http://strawberryj.am' },
    { :name => 'Flipboard', :match => 'FlipboardProxy', :url => 'http://flipboard.com/browserproxy' },
    { :name => 'Google Feedfetcher', :match => 'Feedfetcher', :url => 'http://www.google.com/feedfetcher.html' },
    # NOTE(review): this token is broader than the one is_gplus_bot? checks
    # ('Gecko/20110814 Firefox/6.0') - confirm whether that is intentional.
    { :name => 'Google Plus', :match => '20110814', :url => 'http://plus.google.com' },
    { :name => 'EventMachine', :match => 'EventMachine', :url => 'http://rubyeventmachine.com/' },
    { :name => 'Python urllib', :match => 'urllib', :url => 'http://docs.python.org/library/urllib.html' },
    { :name => 'Gnip', :match => 'gnip.com', :url => 'http://gnip.com' },
    { :name => 'Paper.li', :match => 'PaperLiBot', :url => 'http://paper.li' },
    { :name => 'TweetMeme', :match => 'TweetmemeBot', :url => 'http://tweetmeme.com' },
    { :name => 'Facebook', :match => 'facebookexternalhit', :url => 'http://www.facebook.com' },
    { :name => 'LinkedIn', :match => 'LinkedInBot', :url => 'http://linkedin.com' },
    { :name => 'Topsy', :match => 'Butterfly', :url => 'http://topsy.com' },
    { :name => 'Summify', :match => 'Summify', :url => 'http://summify.com' },
    { :name => 'Twitterbot', :match => 'Twitterbot', :url => 'http://twitter.com' },
    { :name => 'Echo', :match => 'JS-Kit', :url => 'http://aboutecho.com' },
    { :name => 'Yandex', :match => 'YandexBot', :url => 'http://yandex.com' },
    { :name => 'MetaURI', :match => 'MetaURI', :url => 'http://metauri.com' },
    { :name => 'Bitly', :match => 'bitlybot ', :url => 'http://bitly.com' },
    { :name => 'TweetedTimes', :match => 'TweetedTimes', :url => 'http://tweetedtimes.com' },
    { :name => 'Google', :match => 'Googlebot', :url => 'http://google.com' },
    { :name => 'Evri', :match => 'Evrinid', :url => 'http://evri.com' },
    { :name => 'LongURL', :match => 'LongURL', :url => 'http://longurl.org' },
    { :name => 'Topixtream', :match => 'Castabot', :url => 'http://topixtream.com' },
    { :name => 'urlresolver', :match => 'urlresolver', :url => 'http://ant.apache.org/ivy/history/latest-milestone/resolver/url.html' },
    { :name => 'cURL', :match => 'curl', :url => 'http://curl.haxx.se' },
    { :name => 'NineConnections.com', :match => 'Kimengi', :url => 'http://NineConnections.com' },
    { :name => 'Status.net (Laconica)', :match => 'Laconica', :url => 'http://status.net/' },
    { :name => 'Showyou.com', :match => 'Showyoubot', :url => 'http://showyou.com' },
    { :name => 'PostPost', :match => 'PostPost', :url => 'http://postpost.com' },
    { :name => 'OGS Critter', :match => 'OGS Critter', :url => 'http://google.com/?OGS+Critter' },
    { :name => 'postrank', :match => 'PostRank', :url => 'http://postrank.com' },
    { :name => 'percbotspider', :match => 'percobotspider', :url => 'http://google.com/?percobotspider' },
    { :name => 'Equentia', :match => 'EQUENTIA-BOT', :url => 'http://eqentia.com' },
    { :name => 'yolinkBot', :match => 'yolinkBot', :url => '' },
    { :name => 'NING', :match => 'NING', :url => '' },
    { :name => 'Instapaper', :match => 'instapaper', :url => '' },
    { :name => 'Blekkobot', :match => 'blekko', :url => 'http://blekko.com/about/blekkobot' },
    { :name => 'Baiduspider', :match => 'Baiduspider', :url => 'http://www.baidu.com/search/spider.html' },
    { :name => 'Ruby', :match => 'Ruby', :url => '' },
    { :name => 'Diffbot', :match => 'diffbot', :url => 'http://diffbot.com' },
    { :name => 'FriendFeedBot', :match => 'friendfeed', :url => 'http://friendfeed.com/about/bot' },
    { :name => 'Vancouver', :match => 'Vancouver', :url => '' },
    { :name => 'ScribdReader', :match => 'ScribdReader', :url => 'http://www.float.com' },
    { :name => 'HiveAnalyzer', :match => 'HiveAnalyzer', :url => 'http://www.businessinsider.com' },
    { :name => 'Yeti', :match => 'Yeti', :url => 'http://help.naver.com/robots/' },
    { :name => 'MFE_expand', :match => 'MFE_expand', :url => '' },
    { :name => 'intigi', :match => 'intigi', :url => 'http://intigi.com' },
    { :name => 'Trove', :match => 'Trove', :url => '' },
    { :name => 'EQENTIA-BOT', :match => 'EQENTIA-BOT', :url => '' },
    { :name => 'JS-Kit', :match => 'js-kit', :url => 'http://js-kit.com/' },
    { :name => 'spider', :match => 'spider', :url => '' },
    { :name => 'bingbot', :match => 'bingbot', :url => 'http://www.bing.com/bingbot.htm' },
    { :name => 'coccoc', :match => 'coccoc', :url => '' }
  ].each(&:freeze).freeze

  # Looks the user agent up in BOTS.
  #
  # @param useragent [String, nil] raw User-Agent value; coerced with #to_s
  #   so nil and non-String values are accepted, as in the other predicates
  # @return [Hash, false, nil] the first BOTS entry whose :match is a
  #   substring of the user agent (truthy); false when nothing matches;
  #   nil when the user agent is nil, empty or whitespace-only
  def self.is_bot?(useragent)
    ua = useragent.to_s
    return if ua.strip.empty?

    # `|| false` keeps the historical "no match" return value (find gives nil).
    BOTS.find { |bot| ua.include?(bot[:match]) } || false
  end

  # @param useragent [String, nil] raw User-Agent value
  # @return [Boolean] true when the user agent contains 'facebookexternalhit'
  def self.is_facebook_bot?(useragent)
    useragent.to_s.include?('facebookexternalhit')
  end

  # @param useragent [String, nil] raw User-Agent value
  # @return [Boolean] true when the user agent contains 'LinkedInBot'
  def self.is_linkedin_bot?(useragent)
    useragent.to_s.include?('LinkedInBot')
  end

  # @param useragent [String, nil] raw User-Agent value
  # @return [Boolean] true when the user agent contains the Firefox 6 build
  #   string shown in the "gplus" comment at the top of this class
  def self.is_gplus_bot?(useragent)
    useragent.to_s.include?('Gecko/20110814 Firefox/6.0')
  end

  # @param useragent [String, nil] raw User-Agent value
  # @return [Boolean] true when the user agent is recognised as the Facebook,
  #   LinkedIn or Google+ agent. What "cloaking" entails is decided by the
  #   caller and is not visible here.
  def self.needs_cloaking?(useragent)
    is_facebook_bot?(useragent) || is_linkedin_bot?(useragent) || is_gplus_bot?(useragent)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment