Last active
May 30, 2017 16:17
-
-
Save jaimeiniesta/8201769a68549c94434fb20d6d48ceee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A basic spider that will follow internal links, checking broken links
#
# Usage example:
#
#   ruby link_checker.rb example.com
# puts "Using MetaInspector #{MetaInspector::VERSION}"
require 'open-uri'
require 'set'
require 'metainspector'
# A spider that starts at one URL, follows the internal links it finds and
# records which linked URLs are broken, together with the pages linking to them.
class LinkChecker
  # Only absolute http(s) URLs are status-checked. Anchored with \A so the
  # scheme has to open the string, not merely some line inside it.
  HTTP_URL = %r{\Ahttps?://}i.freeze

  # Creates the checker and runs the whole crawl right away, printing one
  # statistics line per crawled page.
  #
  # @param url [String] URL where the crawl starts
  def initialize(url)
    @url     = url
    @queue   = []       # URLs waiting to be crawled, consumed from the end
    @seen    = Set.new  # every URL that was ever put on the queue
    @visited = Set.new  # resolved URLs of the pages already crawled
    @ok      = Set.new  # linked URLs known to be reachable
    @broken  = {}       # broken URL => pages that link to it
    check
  end

  # Prints the number of broken links, then each broken URL followed by the
  # pages it is linked from.
  def report
    puts "\n#{@broken.size} broken links found."
    @broken.each do |link, origins|
      puts "\n#{link} linked from"
      origins.each { |origin| puts " - #{origin}" }
    end
  end

  private

  # Seeds the queue with the starting URL and crawls until the queue is empty.
  def check
    # Resolves redirections of initial URL before placing it on the queue
    enqueue(MetaInspector.new(@url).url)
    process_next_on_queue while @queue.any?
  end

  # Crawls the next queued URL: checks the status of every http(s) link on
  # the page, then queues the internal links that still have to be crawled.
  def process_next_on_queue
    page = MetaInspector.new(@queue.pop)

    # A redirect can lead to a page that was already crawled under another
    # URL; Set#add? returns nil in that case and the page is skipped.
    return unless @visited.add?(page.url)

    page.links.all.select { |link| HTTP_URL.match?(link) }.each do |link|
      check_status(link, page.url)
    end

    # Only internal links are followed, so the crawl never leaves the site.
    page.links.internal.each do |link|
      enqueue(link) if should_be_enqueued?(link)
    end

    show_stats
  end

  # Puts the url on the queue and remembers that it was queued, so a URL that
  # redirects elsewhere is still recognised when it is linked again.
  def enqueue(url)
    @seen << url
    @queue.push(url)
  end

  # Checks the response status of the linked_url and stores it on the ok or broken collections
  def check_status(linked_url, from_url)
    if @broken.key?(linked_url)
      # This was already known to be broken, we add another origin
      origins = @broken[linked_url]
      origins << from_url unless origins.include?(from_url)
    elsif !@ok.include?(linked_url)
      # We still don't know about this link status, so we check it now
      if reachable?(linked_url)
        @ok << linked_url
      else
        @broken[linked_url] = [from_url]
      end
    end
  end

  # A url is queued only if it was never queued before, does not match an
  # already crawled page and is not known to be broken.
  def should_be_enqueued?(url)
    !(@seen.include?(url) || @visited.include?(url) || @broken.key?(url))
  end

  # Prints the running totals of crawled pages, queued pages and broken links.
  def show_stats
    puts format('%3s pages visited, %3s pages on queue, %2s broken links',
                @visited.size, @queue.size, @broken.size)
  end

  # A page is reachable if its response status is less than 400
  # In the case of exceptions, like timeouts or server connection errors,
  # we consider it unreachable
  def reachable?(url)
    MetaInspector.new(url).response.status < 400
  rescue StandardError
    false
  end
end
# Get the starting URL: the first command-line argument, or an answer typed
# on standard input when no argument was given.
url = ARGV[0]
if url.nil?
  puts "Enter a starting url"
  # gets returns nil at end of input; to_s turns that into a blank answer
  url = $stdin.gets.to_s.strip
end
abort "No starting url given" if url.empty?

LinkChecker.new(url).report
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment