robots.rb
# Copyright Steven Karas 2013 - 2014
# Licensed under MIT terms

# Represents a robots.txt file
class Robots
  # Parse the given content as a robots.txt file
  #
  # @see http://www.robotstxt.org/
  def initialize(content)
    @sitemaps = []
    @bots = {
      "*" => {
        delay: nil,
        disallowed: [],
        allowed: [],
      }
    }
    agents = []
    content.split("\n").each do |line|
      line = line.strip
      begin
        case line
        when /^User-agent:\s*(.*)$/i
          agents << $~[1]
          agents.each do |agent|
            @bots[agent] ||= {
              delay: nil,
              disallowed: [],
              allowed: [],
            }
          end
        when ""
          # a blank line ends the current group of User-agent records
          agents = []
        when /^Allow:\s*(.*)$/i
          allowed_path = $~[1]
          if allowed_path == ""
            # an empty Allow record means nothing is explicitly allowed
            agents.each do |agent|
              @bots[agent][:disallowed] << "*"
            end
          else
            agents.each do |agent|
              @bots[agent][:allowed] << allowed_path
            end
          end
        when /^Disallow:\s*(.*)$/i
          disallowed_path = $~[1]
          if disallowed_path == ""
            # an empty Disallow record means everything is allowed
            agents.each do |agent|
              @bots[agent][:allowed] << "*"
            end
          else
            agents.each do |agent|
              @bots[agent][:disallowed] << disallowed_path
            end
          end
        when /^Crawl-Delay:\s*(.*)$/i
          agents.each do |agent|
            @bots[agent][:delay] = $~[1].to_i
          end
        when /^Sitemap:\s*(.*)$/i
          @sitemaps << $~[1]
        end
      rescue
        # skip malformed lines
      end
    end
  end
  def sitemaps
    return @sitemaps
  end

  def bots
    return @bots
  end
  # Get a hash of directives that apply to your bot
  def bot(botname)
    # Collect the directives of every User-agent group whose name matches
    # botname, ordered from least to most specific match
    directive_list = @bots.map do |name, directives|
      [name_score(botname, name), directives]
    end.sort_by(&:first).select do |list|
      list.first > 0
    end.map(&:last)
    gather_directives(directive_list)
  end

  def raw_bot(botname)
    return @bots[botname]
  end
  # Check if a particular URL is allowed for your crawler
  #
  # NOTE: this is a political function, meaning it has arbitrary behavior and correctness
  #
  # @param directives [Hash] the directives as retrieved by #bot
  # @param url [String] the url to check against
  # @return [Boolean] whether the given url is allowed to be crawled based on the provided directives
  def allowed(directives, url, strategy = :default)
    case strategy
    when :default
      # URLs are allowed unless a Disallow prefix matches; a matching Allow
      # prefix overrides any matching Disallow
      result = true
      directives[:disallowed].each do |pattern|
        if url.start_with?(pattern)
          result = false
        end
      end
      directives[:allowed].each do |pattern|
        if url.start_with?(pattern)
          result = true
        end
      end
      return result
    when :google
      #TODO
      true
    when :bing
      #TODO
      true
    end
  end
  private

  def gather_directives(directive_list)
    result = {
      delay: @bots["*"][:delay] || 10,
      disallowed: [],
      allowed: [],
    }
    directive_list.each do |directives|
      result[:delay] = directives[:delay] || result[:delay]
      result[:disallowed] += directives[:disallowed]
      result[:allowed] += directives[:allowed]
    end
    return result
  end

  # Score how well a User-agent pattern matches the given bot name:
  # exact matches beat substring matches, longer patterns beat shorter ones,
  # and a score of 0 means no match
  def name_score(botname, pattern)
    return 1 if pattern == "*" # the wildcard group applies to every bot
    if botname == pattern
      return botname.length + 1
    elsif botname.include? pattern
      return pattern.length
    else
      return 0
    end
  end
end
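A minimal usage sketch (the robots.txt content, the ExampleBot crawler name, and the example.com URLs below are made up for illustration; it assumes the file above is saved as robots.rb next to the script):

require_relative "robots"

content = <<ROBOTS
User-agent: ExampleBot
Disallow: /private/
Allow: /private/reports/
Crawl-Delay: 5

Sitemap: http://example.com/sitemap.xml
ROBOTS

robots = Robots.new(content)
directives = robots.bot("ExampleBot")
robots.allowed(directives, "/private/secrets")       #=> false
robots.allowed(directives, "/private/reports/2014")  #=> true
directives[:delay]                                   #=> 5
robots.sitemaps                                      #=> ["http://example.com/sitemap.xml"]

The directives hash returned by #bot is what #allowed expects as its first argument.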