@stevenkaras
Created May 29, 2014 11:45
robots.rb
# Copyright Steven Karas 2013 - 2014
# Licensed under MIT terms

# Represents a robots.txt file
class Robots
  # Parse the given content as a robots.txt file
  #
  # @see http://www.robotstxt.org/
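  #
  # A minimal usage sketch (the robots.txt body below is hypothetical):
  #
  # @example
  #   robots = Robots.new("User-agent: *\nDisallow: /private/\nSitemap: http://example.com/sitemap.xml")
  #   robots.sitemaps               #=> ["http://example.com/sitemap.xml"]
  #   robots.bots["*"][:disallowed] #=> ["/private/"]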
  def initialize(content)
    @sitemaps = []
    @bots = {
      "*" => {
        delay: nil,
        disallowed: [],
        allowed: [],
      }
    }
    agents = []
    content.split("\n").each do |line|
      begin
        # Trim surrounding whitespace so CRLF line endings and padded
        # directives still match
        line = line.strip
        case line
        when /\AUser-agent:\s*(.*)\z/i
          agents << $~[1]
          agents.each do |agent|
            @bots[agent] ||= {
              delay: nil,
              disallowed: [],
              allowed: [],
            }
          end
        when ""
          # A blank line closes the current group of user agents
          agents = []
        when /\AAllow:\s*(.*)\z/i
          # NOTE: the \A anchor matters; an unanchored /Allow:/i would also
          # match "Disallow:" lines
          allowed_path = $~[1]
          if allowed_path == ""
            # By symmetry with the empty-Disallow rule below, treat an empty
            # Allow as disallowing everything
            agents.each do |agent|
              @bots[agent][:disallowed] << "*"
            end
          else
            agents.each do |agent|
              @bots[agent][:allowed] << allowed_path
            end
          end
        when /\ADisallow:\s*(.*)\z/i
          disallowed_path = $~[1]
          if disallowed_path == ""
            # An empty Disallow means everything is allowed
            agents.each do |agent|
              @bots[agent][:allowed] << "*"
            end
          else
            agents.each do |agent|
              @bots[agent][:disallowed] << disallowed_path
            end
          end
        when /\ACrawl-Delay:\s*(.*)\z/i
          agents.each do |agent|
            @bots[agent][:delay] = $~[1].to_i
          end
        when /\ASitemap:\s*(.*)\z/i
          @sitemaps << $~[1]
        end
      rescue
        # Skip malformed lines rather than failing the whole parse
      end
    end
  end

  # The sitemap URLs listed in the file
  attr_reader :sitemaps

  # The raw directive groups, keyed by user agent pattern
  attr_reader :bots

  # Get a hash of directives that apply to your bot
  #
  # Every group whose name matches the given bot name is merged, least
  # specific first, so the most specific group's crawl delay wins; the "*"
  # group contributes only its crawl delay (see #gather_directives)
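  #
  # A sketch of the merging behavior (the group names here are hypothetical):
  # given groups for "Googlebot" and "Googlebot-Image", bot("Googlebot-Image")
  # merges both, least specific first:
  #
  # @example
  #   robots.bot("Googlebot-Image")
  #   #=> { delay: ..., disallowed: [...], allowed: [...] }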
  def bot(botname)
    directive_list = @bots.map do |name, directives|
      [name_score(botname, name), directives]
    end.sort_by(&:first).select do |list|
      list.first > 0
    end.map(&:last)
    gather_directives(directive_list)
  end

  # Get the directive group for an exact user agent name, without merging
  def raw_bot(botname)
    @bots[botname]
  end

  # Check if a particular URL is allowed for your crawler
  #
  # NOTE: this is a political function, meaning it has arbitrary behavior and
  # correctness
  #
  # @param directives [Hash] the directives as retrieved by #bot
  # @param url [String] the url to check against
  # @param strategy [Symbol] the matching strategy to use (only :default is implemented)
  # @return [Boolean] whether the given url is allowed to be crawled based on the provided directives
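  #
  # A sketch of the :default strategy (the directives below are hypothetical);
  # an Allow prefix match overrides any Disallow prefix match because the
  # allowed list is checked last:
  #
  # @example
  #   directives = { disallowed: ["/private"], allowed: ["/private/public"] }
  #   robots.allowed(directives, "/private/public/page") #=> true
  #   robots.allowed(directives, "/private/secret")      #=> false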
  def allowed(directives, url, strategy = :default)
    case strategy
    when :default
      # Later matches win: Allow prefix matches override Disallow ones.
      # "*" is the catch-all sentinel recorded by the parser for empty rules.
      result = true
      directives[:disallowed].each do |pattern|
        if pattern == "*" || url.start_with?(pattern)
          result = false
        end
      end
      directives[:allowed].each do |pattern|
        if pattern == "*" || url.start_with?(pattern)
          result = true
        end
      end
      return result
    when :google
      # TODO: implement Google's longest-match precedence rules
      true
    when :bing
      # TODO
      true
    end
  end

  private

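  # Merge a list of matched directive groups: the crawl delay falls back to
  # the "*" group's delay, then to 10 seconds, and since #bot sorts groups
  # least specific first, the most specific group's delay wins. A sketch with
  # hypothetical groups:
  #
  #   gather_directives([
  #     { delay: nil, disallowed: ["/a"], allowed: [] },
  #     { delay: 2,   disallowed: ["/b"], allowed: [] },
  #   ])
  #   #=> { delay: 2, disallowed: ["/a", "/b"], allowed: [] }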
  def gather_directives(directive_list)
    result = {
      delay: @bots["*"][:delay] || 10,
      disallowed: [],
      allowed: [],
    }
    directive_list.each do |directives|
      result[:delay] = directives[:delay] || result[:delay]
      result[:disallowed] += directives[:disallowed]
      result[:allowed] += directives[:allowed]
    end
    result
  end
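
  # Score how well a group name matches the given bot name: exact matches
  # rank highest, then longer substrings; 0 means no match. A sketch with
  # hypothetical bot names:
  #
  #   name_score("Googlebot-Image", "Googlebot-Image") #=> 16 (length + 1)
  #   name_score("Googlebot-Image", "Googlebot")       #=> 9  (substring length)
  #   name_score("Googlebot-Image", "Bingbot")         #=> 0  (filtered out by #bot)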
  def name_score(botname, pattern)
    if botname == pattern
      return botname.length + 1
    elsif botname.include? pattern
      return pattern.length
    else
      return 0
    end
  end
end
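
# A minimal end-to-end sketch, run when this file is executed directly
# (the robots.txt body and URLs below are hypothetical):
if __FILE__ == $0
  content = [
    "User-agent: *",
    "Crawl-Delay: 5",
    "Disallow: /private/",
    "",
    "User-agent: Googlebot",
    "Disallow: /nosearch/",
    "",
    "Sitemap: http://example.com/sitemap.xml",
  ].join("\n")

  robots = Robots.new(content)
  directives = robots.bot("Googlebot")
  puts directives[:delay]                            #=> 5 (inherited from the "*" group)
  puts robots.allowed(directives, "/nosearch/page")  #=> false
  puts robots.allowed(directives, "/other/page")     #=> true
  puts robots.sitemaps.inspect                       #=> ["http://example.com/sitemap.xml"]
end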