Last active
April 8, 2023 04:39
-
-
Save afcapel/758ad314ca97067de43f67a35fc70aa0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "open-uri" | |
module Imports | |
# Crawls a site starting from one URL, following links that stay on the start
# URL's host. Pages are visited in the order they were discovered.
#
# +follow_patterns+ and +ignore_patterns+ are lists of regexps matched against
# the path of each candidate link: when +follow_patterns+ is non-empty a link
# must match one of them, and a link matching any of +ignore_patterns+ is
# always skipped.
class PageCrawler
  attr_reader :current_page, :pages, :crawled
  attr_accessor :follow_patterns, :ignore_patterns

  # @param start_url [String] absolute URL where the crawl begins
  # @param max_depth [Integer] links found on pages at this depth are not followed
  # @param max_pages [Integer] upper bound on the number of queued pages
  # @raise [URI::InvalidURIError] if +start_url+ cannot be parsed
  def initialize(start_url, max_depth: 3, max_pages: 1000)
    @max_depth = max_depth
    @max_pages = max_pages
    @start_url = URI.parse(start_url)
    @follow_patterns = []
    @ignore_patterns = [/mailto/]
    @crawled = 0
    @pages = [CrawledPage.new(start_url, 0)]
  end

  # Fetches every pending page, handing the block on to CrawledPage#crawl,
  # and queues the links found on each page until nothing is pending.
  def crawl(&block)
    while (@current_page = @pages.find(&:pending?))
      @current_page.crawl(&block)
      @crawled += 1
      add_next_pages
    end
  end

  private

  # Queues the followable links of the current page one level deeper,
  # unless the depth or page limit has already been reached.
  def add_next_pages
    return if @current_page.depth >= @max_depth || @pages.size >= @max_pages

    next_depth = @current_page.depth + 1
    candidates = @current_page.links.map { |link| normalize(link) }.compact
    discovered = candidates
                 .select { |link| should_follow?(link) }
                 .map { |link| CrawledPage.new(link, next_depth) }

    # Compare URLs by their string form: the start page holds the String it
    # was given while discovered pages hold URI objects, and the two never
    # compare equal otherwise.
    @pages = (@pages + discovered).uniq { |page| page.url.to_s }.take(@max_pages)
  end

  # Turns an href into an absolute URI without fragment, or nil (after
  # logging) when it cannot be parsed or resolved.
  def normalize(link)
    uri = URI.parse(link.strip)
    # Relative links are relative to the page they appear on, not to the
    # URL the crawl started from.
    uri = URI.parse(@current_page.url.to_s).merge(uri) if uri.relative?
    # The fragment does not identify a different page, so drop it to keep
    # "page#a" and "page#b" from being queued separately.
    uri.fragment = nil if uri.fragment
    uri
  rescue URI::Error => ex
    Rails.logger.error(ex.message)
    nil
  end

  # True when +link+ is on the start URL's host and its path passes the
  # follow/ignore patterns.
  def should_follow?(link)
    return false unless @start_url.host == link.host

    path = link.path.to_s
    wanted = follow_patterns.empty? || matches_any?(follow_patterns, path)
    wanted && !matches_any?(ignore_patterns, path)
  end

  # True when at least one of +patterns+ matches +path+.
  def matches_any?(patterns, path)
    patterns.any? { |pattern| pattern.match?(path) }
  end
end
# A single URL queued for crawling, together with whatever was fetched from it.
class CrawledPage
  attr_reader :url, :html, :doc, :depth

  # @param url [String, URI] address to fetch; converted with #to_s when opened
  # @param depth [Integer] crawl depth assigned to this page by the caller
  def initialize(url, depth)
    @url = url
    @depth = depth
    @pending = true
  end

  # Fetches the page body into +html+ and, when a block is given, yields
  # +url+ and the body to it. An OpenURI::HTTPError is logged and swallowed,
  # leaving +html+ as it was. The page stops being pending whether or not
  # the fetch succeeded.
  #
  # @yieldparam url [String, URI] the URL this page was created with
  # @yieldparam html [String] the fetched body
  def crawl
    # Block form so the underlying IO is closed as soon as it has been read.
    @html = URI.open(@url.to_s, &:read)
    yield @url, @html if block_given?
  rescue OpenURI::HTTPError => ex
    Rails.logger.error(ex.message)
  ensure
    @pending = false
  end

  # @return [Array<String>] the non-blank href values of the page's anchors;
  #   empty when no body has been fetched
  def links
    # Nothing to parse before a fetch or after a failed one; returning early
    # also avoids memoizing a result for a body that does not exist yet.
    return [] if @html.nil?

    @doc ||= Nokogiri::HTML(@html)
    @links ||= @doc.xpath("//a").map { |anchor| anchor["href"] }.reject(&:blank?)
  end

  # @return [Boolean] true until #crawl has been attempted
  def pending?
    @pending
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment