XORwell · December 22, 2015 09:48
diff --git a/feed_discoverer.rb b/feed_discoverer.rb
 # Discovers the RSS/Atom feeds a web page advertises through
 # <link type="application/rss+xml" ...> / <link type="application/atom+xml" ...> tags.
 class FeedDiscoverer
   require 'open-uri'

   # Constructor accepts options
   # @param [Hash] opts
   # Available options:
   # * unique (default: true) - filter duplicate urls
   def initialize(opts={})
     # merge (not merge!) into a fresh hash; tolerate an explicit nil
     @opts = {:unique => true}.merge(opts || {})
   end

   # Get HTML and discover links
   # @param [String] url
   # @return [Array<String>] feed urls; relative ones are resolved against +url+
   # @todo case feed-aggregators.. detect if given url is a feed (starts with <?xml) then return url, otherwise (website) call discover
   def fetch_and_discover(url)
     discover(fetch(url), url)
   end

   # Get HTML from URL
   #
   # The request goes through the parsed URI object instead of Kernel#open:
   # Kernel#open treats a leading "|" as a command to spawn, and since Ruby 3.0
   # it no longer understands URLs at all.
   # @param [String] url an http, https or ftp URL
   # @return [String] html
   # @raise [ArgumentError] if +url+ has no scheme open-uri can fetch
   # @raise [URI::InvalidURIError] if +url+ cannot be parsed
   def fetch(url)
     uri = URI.parse(url.to_s)
     unless uri.respond_to?(:open)
       raise ArgumentError, "cannot fetch #{url.inspect}: expected an http(s) or ftp URL"
     end
     # block form closes the underlying IO as soon as the body is read
     uri.open(&:read)
   end

   # Search HTML for link-tags typeof feed
   #
   # Tags without an href, with an empty href, or with an href that is not a
   # parseable URI are skipped rather than aborting the whole search.
   # @param [String] html
   # @param [String] url (optional) for relative to absolute url
   # @return [Array<String>] urls
   def discover(html, url=nil)
     link_regex = /
       <link                            # begin link tag
       (?:\s+|\s+[^>]+\s+)              # whitespace, possibly with other attributes in between
       type=['"]                        # begin type attribute with leading quote
       application\/(?:atom|rss)\+xml   # Atom or RSS MIME type
       ['"][^>]*                        # trailing quote, possibly some more stuff
       >                                # end tag
     /ix                                # ignore case, whitespace and comments

     href_regex = /
       (?<![\w\-])href   # the href attribute itself, not data-href or similar
       \s*=\s*           # equals sign, tolerating surrounding spaces
       (['"])            # opening quote, remembered as group 1
       (.*?)             # the value, as short as possible, may be empty
       \1                # must close with the same quote that opened it
     /ix                 # ignore case, whitespace and comments

     urls = html.to_s.scan(link_regex).map do |link|
       found = link.match(href_regex)
       absolutize(found[2], url) if found
     end.compact

     @opts[:unique] ? urls.uniq : urls
   end

   private

   # Turn one raw href value into the url to report.
   # @param [String] href raw attribute value
   # @param [String, nil] base page url used to resolve relative references
   # @return [String, nil] the url, or nil when the href is empty or malformed
   def absolutize(href, base)
     href = href.strip
     return nil if href.empty?

     begin
       # relative? is true for path-only and for protocol-relative references
       relative = URI.parse(href).relative?
     rescue URI::InvalidURIError
       return nil
     end

     relative && base ? URI.join(base, href).to_s : href
   end
 end
	# Discovers the RSS/Atom feeds a web page advertises through
	# <link type="application/rss+xml" ...> / <link type="application/atom+xml" ...> tags.
	class FeedDiscoverer
	  require 'open-uri'

	  # Constructor accepts options
	  # @param [Hash] opts
	  # Available options:
	  # * unique (default: true) - filter duplicate urls
	  def initialize(opts={})
	    # merge (not merge!) into a fresh hash; tolerate an explicit nil
	    @opts = {:unique => true}.merge(opts || {})
	  end

	  # Get HTML and discover links
	  # @param [String] url
	  # @return [Array<String>] feed urls; relative ones are resolved against +url+
	  # @todo case feed-aggregators.. detect if given url is a feed (starts with <?xml) then return url, otherwise (website) call discover
	  def fetch_and_discover(url)
	    discover(fetch(url), url)
	  end

	  # Get HTML from URL
	  #
	  # The request goes through the parsed URI object instead of Kernel#open:
	  # Kernel#open treats a leading "|" as a command to spawn, and since Ruby 3.0
	  # it no longer understands URLs at all.
	  # @param [String] url an http, https or ftp URL
	  # @return [String] html
	  # @raise [ArgumentError] if +url+ has no scheme open-uri can fetch
	  # @raise [URI::InvalidURIError] if +url+ cannot be parsed
	  def fetch(url)
	    uri = URI.parse(url.to_s)
	    unless uri.respond_to?(:open)
	      raise ArgumentError, "cannot fetch #{url.inspect}: expected an http(s) or ftp URL"
	    end
	    # block form closes the underlying IO as soon as the body is read
	    uri.open(&:read)
	  end

	  # Search HTML for link-tags typeof feed
	  #
	  # Tags without an href, with an empty href, or with an href that is not a
	  # parseable URI are skipped rather than aborting the whole search.
	  # @param [String] html
	  # @param [String] url (optional) for relative to absolute url
	  # @return [Array<String>] urls
	  def discover(html, url=nil)
	    # NOTE: the alternations below use a plain "|"; an escaped pipe would
	    # only match a literal pipe character.
	    link_regex = /
	      <link                            # begin link tag
	      (?:\s+|\s+[^>]+\s+)              # whitespace, possibly with other attributes in between
	      type=['"]                        # begin type attribute with leading quote
	      application\/(?:atom|rss)\+xml   # Atom or RSS MIME type
	      ['"][^>]*                        # trailing quote, possibly some more stuff
	      >                                # end tag
	    /ix                                # ignore case, whitespace and comments

	    href_regex = /
	      (?<![\w\-])href   # the href attribute itself, not data-href or similar
	      \s*=\s*           # equals sign, tolerating surrounding spaces
	      (['"])            # opening quote, remembered as group 1
	      (.*?)             # the value, as short as possible, may be empty
	      \1                # must close with the same quote that opened it
	    /ix                 # ignore case, whitespace and comments

	    urls = html.to_s.scan(link_regex).map do |link|
	      found = link.match(href_regex)
	      absolutize(found[2], url) if found
	    end.compact

	    @opts[:unique] ? urls.uniq : urls
	  end

	  private

	  # Turn one raw href value into the url to report.
	  # @param [String] href raw attribute value
	  # @param [String, nil] base page url used to resolve relative references
	  # @return [String, nil] the url, or nil when the href is empty or malformed
	  def absolutize(href, base)
	    href = href.strip
	    return nil if href.empty?

	    begin
	      # relative? is true for path-only and for protocol-relative references
	      relative = URI.parse(href).relative?
	    rescue URI::InvalidURIError
	      return nil
	    end

	    relative && base ? URI.join(base, href).to_s : href
	  end
	end