dimianstudio · February 25, 2015 08:40
diff --git a/base_fetcher.rb b/base_fetcher.rb
 module Fetcher
  module Base
    # Imports the RSS feeds of one source as Article records.
    #
    # Subclasses supply the feed locations through #rss_sources. A failure while
    # reading a feed, checking an item or creating an article is re-raised in the
    # development environment; otherwise it is collected in @problems and mailed
    # through NotificationMailer once #process has walked every feed.
    class Processor
      attr_reader :source

      # @param source [#item_class, #id] item_class is instantiated with
      #   (item node, index) for every feed item; id is stored as the article's
      #   source_id
      def initialize(source)
        @source = source
        @problems = []
      end

      # Feed locations to read; a single value or a list (wrapped with Array()
      # by #process).
      #
      # @raise [NotImplementedError] always; subclasses are expected to override
      def rss_sources
        fail NotImplementedError, 'rss_sources is not implemented'
      end

      # Reads every feed, imports each <item> that is not disabled and finally
      # reports the problems collected on the way.
      #
      # @return whatever #notify_admin_about_errors! returns (nil when no
      #   problems were collected)
      def process
        Array(rss_sources).each do |rss_source|
          rss_contents = read_rss(rss_source)
          next unless rss_contents

          Nokogiri::XML(rss_contents).search('item').each_with_index do |node, index|
            import_item(source.item_class.new(node, index))
          end
        end

        notify_admin_about_errors!
      end

      private

      # Mails the collected problems; does nothing when there are none.
      def notify_admin_about_errors!
        return if @problems.empty?
        NotificationMailer.problems_while_fetch(@problems).deliver_now
      end

      # Opens one feed location.
      #
      # NOTE(review): Kernel#open runs a string starting with "|" as a command,
      # and only accepts URLs while open-uri patches it (presumably loaded
      # elsewhere - confirm). Make sure rss_sources never carries untrusted input.
      #
      # @return the opened feed, or nil when opening failed outside development
      def read_rss(rss_source)
        open(rss_source)
      rescue => e
        record_problem(e, :fetch_xml_source, rss_source)
        nil
      end

      # Creates an article for the item unless the item reports itself disabled.
      # The disabled? check is guarded as well, so a single malformed item is
      # recorded as a problem instead of aborting the whole run.
      def import_item(item)
        create_article(item) unless item.disabled?
      rescue => e
        record_problem(e, :create_article, item)
      end

      # Persists one article built from the item's attributes.
      def create_article(item)
        Article.create!(
          name: item.name,
          url: item.url,
          body: item.body,
          image_url: item.image_url,
          published_at: item.published_at,
          source_id: source.id
        )
      rescue => e
        record_problem(e, :create_article, item)
      end

      # Re-raises the error in development, otherwise queues it for the
      # admin notification.
      def record_problem(error, type, metadata)
        raise error if Rails.env.development?
        @problems << OpenStruct.new(e: error, type: type, metadata: metadata)
      end
    end

    # Wraps one feed <item> node (member `item`) together with its position in
    # the feed (member `index`) and derives the article attributes from it.
    #
    # Subclasses must implement #elements_with_article_body and may override
    # #elements_for_removing and #cleanup_body.
    class Item < Struct.new(:item, :index)
      # Only images whose first FastImage.size component exceeds this value are
      # kept by #images (presumably the pixel width - confirm against FastImage).
      MIN_IMAGE_WIDTH = 400

      # @return [String] content of the item's first <link> element
      def url
        @url ||= item.search('link').first.content
      end

      # @return [String] content of the item's first <title> element
      def name
        @name ||= item.search('title').first.content
      end

      # @return [String, nil] the url attribute of the first <enclosure>
      #   element; nil when the element or the attribute is missing
      def image_url
        @image_url ||= begin
          enclosure = item.search('enclosure').first
          enclosure && enclosure['url']
        end
      end

      # @return [Time] the parsed content of the first <pubDate> element
      def published_at
        @published_at ||= Time.parse(item.search('pubDate').first.content)
      end

      # @return [Array<String>] string form of every page image that is larger
      #   than MIN_IMAGE_WIDTH; images FastImage cannot size count as 0
      def images
        @images ||= page.images
                        .select { |image| (FastImage.size(image.url.to_s) || []).first.to_i > MIN_IMAGE_WIDTH }
                        .map(&:to_s)
      end

      # Article text taken from the linked page: the first element matching
      # #elements_with_article_body, with script/style/iframe and
      # #elements_for_removing elements dropped, passed through #cleanup_body.
      #
      # @return [String, nil] nil when no body element is found on the page
      def body
        return @body if @body.present?

        # @body temporarily holds the page element; overrides of #cleanup_body
        # may read it from there.
        @body = extract_body_from_page(page)
        return unless @body

        (%w(script style iframe) + elements_for_removing).each do |selector|
          @body.search(selector).remove
        end

        @body = cleanup_body(@body).strip
      end

      # @return [Boolean] true when an Article with this url already exists
      def disabled?
        Article.exists?(url: url)
      end

      # Selectors tried in order to locate the article body on the page.
      #
      # @raise [NotImplementedError] always; subclasses are expected to override
      def elements_with_article_body
        fail NotImplementedError, 'elements_with_article_body is not implemented'
      end

      # Extra selectors removed from the body before its text is extracted.
      def elements_for_removing
        []
      end

      # Reduces the element's text to its non-blank lines, each stripped and
      # joined by "\r\n". Both "\n" and "\r\n" count as line breaks, whether or
      # not whitespace follows them.
      #
      # @param body [#text] element whose text is cleaned
      # @return [String]
      def cleanup_body(body)
        body.text
            .split(/\r?\n/)
            .select(&:present?)
            .map(&:strip)
            .join("\r\n")
      end

      private

      # The page behind #url, fetched once through Mechanize.
      def page
        @page ||= Mechanize.new.get(url)
      end

      # @return the first element of the page matching one of
      #   #elements_with_article_body, or nil
      def extract_body_from_page(page)
        return unless page

        elements_with_article_body.each do |selector|
          found = page.at(selector)
          return found if found
        end

        nil
      end
    end
  end
 end
	module Fetcher
	module Base
	# Imports the RSS feeds of one source as Article records.
	#
	# Subclasses supply the feed locations through #rss_sources. A failure while
	# reading a feed, checking an item or creating an article is re-raised in the
	# development environment; otherwise it is collected in @problems and mailed
	# through NotificationMailer once #process has walked every feed.
	class Processor
	  attr_reader :source

	  # @param source [#item_class, #id] item_class is instantiated with
	  #   (item node, index) for every feed item; id is stored as the article's
	  #   source_id
	  def initialize(source)
	    @source = source
	    @problems = []
	  end

	  # Feed locations to read; a single value or a list (wrapped with Array()
	  # by #process).
	  #
	  # @raise [NotImplementedError] always; subclasses are expected to override
	  def rss_sources
	    fail NotImplementedError, 'rss_sources is not implemented'
	  end

	  # Reads every feed, imports each <item> that is not disabled and finally
	  # reports the problems collected on the way.
	  #
	  # @return whatever #notify_admin_about_errors! returns (nil when no
	  #   problems were collected)
	  def process
	    Array(rss_sources).each do |rss_source|
	      rss_contents = read_rss(rss_source)
	      next unless rss_contents

	      Nokogiri::XML(rss_contents).search('item').each_with_index do |node, index|
	        import_item(source.item_class.new(node, index))
	      end
	    end

	    notify_admin_about_errors!
	  end

	  private

	  # Mails the collected problems; does nothing when there are none.
	  def notify_admin_about_errors!
	    return if @problems.empty?
	    NotificationMailer.problems_while_fetch(@problems).deliver_now
	  end

	  # Opens one feed location.
	  #
	  # NOTE(review): Kernel#open runs a string starting with "|" as a command,
	  # and only accepts URLs while open-uri patches it (presumably loaded
	  # elsewhere - confirm). Make sure rss_sources never carries untrusted input.
	  #
	  # @return the opened feed, or nil when opening failed outside development
	  def read_rss(rss_source)
	    open(rss_source)
	  rescue => e
	    record_problem(e, :fetch_xml_source, rss_source)
	    nil
	  end

	  # Creates an article for the item unless the item reports itself disabled.
	  # The disabled? check is guarded as well, so a single malformed item is
	  # recorded as a problem instead of aborting the whole run.
	  def import_item(item)
	    create_article(item) unless item.disabled?
	  rescue => e
	    record_problem(e, :create_article, item)
	  end

	  # Persists one article built from the item's attributes.
	  def create_article(item)
	    Article.create!(
	      name: item.name,
	      url: item.url,
	      body: item.body,
	      image_url: item.image_url,
	      published_at: item.published_at,
	      source_id: source.id
	    )
	  rescue => e
	    record_problem(e, :create_article, item)
	  end

	  # Re-raises the error in development, otherwise queues it for the
	  # admin notification.
	  def record_problem(error, type, metadata)
	    raise error if Rails.env.development?
	    @problems << OpenStruct.new(e: error, type: type, metadata: metadata)
	  end
	end

	# Wraps one feed <item> node (member `item`) together with its position in
	# the feed (member `index`) and derives the article attributes from it.
	#
	# Subclasses must implement #elements_with_article_body and may override
	# #elements_for_removing and #cleanup_body.
	class Item < Struct.new(:item, :index)
	  # Only images whose first FastImage.size component exceeds this value are
	  # kept by #images (presumably the pixel width - confirm against FastImage).
	  MIN_IMAGE_WIDTH = 400

	  # @return [String] content of the item's first <link> element
	  def url
	    @url ||= item.search('link').first.content
	  end

	  # @return [String] content of the item's first <title> element
	  def name
	    @name ||= item.search('title').first.content
	  end

	  # @return [String, nil] the url attribute of the first <enclosure>
	  #   element; nil when the element or the attribute is missing
	  def image_url
	    @image_url ||= begin
	      enclosure = item.search('enclosure').first
	      enclosure && enclosure['url']
	    end
	  end

	  # @return [Time] the parsed content of the first <pubDate> element
	  def published_at
	    @published_at ||= Time.parse(item.search('pubDate').first.content)
	  end

	  # @return [Array<String>] string form of every page image that is larger
	  #   than MIN_IMAGE_WIDTH; images FastImage cannot size count as 0
	  def images
	    @images ||= page.images
	                    .select { |image| (FastImage.size(image.url.to_s) || []).first.to_i > MIN_IMAGE_WIDTH }
	                    .map(&:to_s)
	  end

	  # Article text taken from the linked page: the first element matching
	  # #elements_with_article_body, with script/style/iframe and
	  # #elements_for_removing elements dropped, passed through #cleanup_body.
	  #
	  # @return [String, nil] nil when no body element is found on the page
	  def body
	    return @body if @body.present?

	    # @body temporarily holds the page element; overrides of #cleanup_body
	    # may read it from there.
	    @body = extract_body_from_page(page)
	    return unless @body

	    (%w(script style iframe) + elements_for_removing).each do |selector|
	      @body.search(selector).remove
	    end

	    @body = cleanup_body(@body).strip
	  end

	  # @return [Boolean] true when an Article with this url already exists
	  def disabled?
	    Article.exists?(url: url)
	  end

	  # Selectors tried in order to locate the article body on the page.
	  #
	  # @raise [NotImplementedError] always; subclasses are expected to override
	  def elements_with_article_body
	    fail NotImplementedError, 'elements_with_article_body is not implemented'
	  end

	  # Extra selectors removed from the body before its text is extracted.
	  def elements_for_removing
	    []
	  end

	  # Reduces the element's text to its non-blank lines, each stripped and
	  # joined by "\r\n". Both "\n" and "\r\n" count as line breaks, whether or
	  # not whitespace follows them.
	  #
	  # @param body [#text] element whose text is cleaned
	  # @return [String]
	  def cleanup_body(body)
	    body.text
	        .split(/\r?\n/)
	        .select(&:present?)
	        .map(&:strip)
	        .join("\r\n")
	  end

	  private

	  # The page behind #url, fetched once through Mechanize.
	  def page
	    @page ||= Mechanize.new.get(url)
	  end

	  # @return the first element of the page matching one of
	  #   #elements_with_article_body, or nil
	  def extract_body_from_page(page)
	    return unless page

	    elements_with_article_body.each do |selector|
	      found = page.at(selector)
	      return found if found
	    end

	    nil
	  end
	end
	end
	end