Created
February 25, 2015 08:40
-
-
Save dimianstudio/f9722026ec1e49fdb82a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Fetcher | |
module Base | |
# Base feed processor. For every feed location returned by #rss_sources it
# downloads the feed, wraps each <item> node in the source's item class and
# stores every item that is not disabled as an Article.
#
# Outside the development environment, download and persistence errors are
# collected in @problems and handed to NotificationMailer once the run is
# finished. In development they are re-raised immediately.
class Processor
  attr_reader :source

  # @param source [Object] must respond to #item_class and #id
  def initialize(source)
    @source = source
    @problems = []
  end

  # Feed locations to fetch; a single value or a list (it is passed through
  # Array()). Concrete processors must override this.
  #
  # @raise [NotImplementedError] always, in this base class
  def rss_sources
    raise NotImplementedError, 'rss_sources is not implemented'
  end

  # Fetches all feeds, creates the missing articles and finally reports the
  # collected problems, if any.
  def process
    Array(rss_sources).each do |rss_source|
      rss_contents = read_rss(rss_source)
      next unless rss_contents

      Nokogiri::XML(rss_contents).search('item').each_with_index do |node, position|
        item = source.item_class.new(node, position)
        next if item.disabled?

        create_article(item)
      end
    end

    notify_admin_about_errors!
  end

  private

  # Mails the collected problems; does nothing when the run was clean.
  def notify_admin_about_errors!
    return if @problems.empty?

    NotificationMailer.problems_while_fetch(@problems).deliver_now
  end

  # Downloads one feed and returns its text, or nil when the download failed
  # (the failure is recorded in @problems).
  #
  # URI.open is used instead of a bare Kernel#open: since Ruby 3.0 the bare
  # form no longer opens URLs. The block form also closes the underlying
  # handle as soon as the content has been read. Relies on open-uri being
  # loaded, exactly as the previous Kernel#open call on a URL did.
  def read_rss(rss_source)
    URI.open(rss_source, &:read)
  rescue => e
    raise if Rails.env.development?

    @problems << OpenStruct.new(e: e, type: :fetch_xml_source, metadata: rss_source)
    nil
  end

  # Persists one feed item as an Article. A failure is recorded in @problems
  # (re-raised in development) so the remaining items are still processed.
  def create_article(item)
    Article.create!(
      name: item.name,
      url: item.url,
      body: item.body,
      image_url: item.image_url,
      published_at: item.published_at,
      source_id: source.id
    )
  rescue => e
    raise if Rails.env.development?

    @problems << OpenStruct.new(e: e, type: :create_article, metadata: item)
  end
end
# Wraps a single RSS <item> node (+item+) together with its position in the
# feed (+index+) and exposes the values needed to build an Article. The
# article text and images are taken from the page the item links to.
#
# Subclasses must implement #elements_with_article_body and may override
# #elements_for_removing and #cleanup_body.
class Item < Struct.new(:item, :index)
  # Images are kept only when the first value returned by FastImage.size
  # (documented by FastImage as the width) is larger than this.
  MIN_IMAGE_WIDTH = 400

  # Elements that are always stripped from the extracted article body.
  DEFAULT_REMOVED_ELEMENTS = %w[script style iframe].freeze

  # @return [String] content of the item's first <link> element
  def url
    @url ||= item.search('link').first.content
  end

  # @return [String] content of the item's first <title> element
  def name
    @name ||= item.search('title').first.content
  end

  # @return [String, nil] the +url+ attribute of the first <enclosure>
  #   element, or nil when the item has no enclosure (or no such attribute)
  def image_url
    @image_url ||= begin
      enclosure = item.search('enclosure').first
      # Explicit nil check instead of a blanket `rescue nil`, so unrelated
      # errors are no longer silently swallowed.
      enclosure && enclosure['url']
    end
  end

  # @return [Time] the parsed content of the first <pubDate> element
  def published_at
    @published_at ||= Time.parse(item.search('pubDate').first.content)
  end

  # @return [Array<String>] string form of every image on the linked page
  #   that is wider than MIN_IMAGE_WIDTH; images whose size cannot be
  #   determined count as width 0 and are dropped
  def images
    @images ||= page.images.select do |image|
      (FastImage.size(image.url.to_s) || []).first.to_i > MIN_IMAGE_WIDTH
    end.map(&:to_s)
  end

  # Extracts the article text from the linked page.
  #
  # The intermediate node is held in a local variable, so @body only ever
  # contains the final string; an error half-way through can no longer leave
  # a node behind that a later call would return as the "body".
  #
  # @return [String, nil] cleaned-up text, or nil when none of
  #   #elements_with_article_body matched
  def body
    return @body if @body.present?

    node = extract_body_from_page(page)
    return unless node

    (DEFAULT_REMOVED_ELEMENTS + elements_for_removing).each do |selector|
      node.search(selector).remove
    end

    @body = cleanup_body(node).strip
  end

  # @return [Boolean] true when an Article with this item's url already exists
  def disabled?
    Article.exists?(url: url)
  end

  # Selectors tried in order to locate the article body on the page.
  # Concrete items must override this.
  #
  # @raise [NotImplementedError] always, in this base class
  def elements_with_article_body
    raise NotImplementedError, 'elements_with_article_body is not implemented'
  end

  # Extra selectors to strip from the body; none by default.
  def elements_for_removing
    []
  end

  # Turns a body node into plain text: one trimmed line per non-blank source
  # line, joined with "\r\n".
  #
  # Works on the +body+ argument (the previous version ignored it and read
  # @body). Lines are split on both "\n" and "\r\n", so a newline followed
  # directly by text is normalised as well.
  #
  # @param body [#text] node whose text should be cleaned
  # @return [String]
  def cleanup_body(body)
    body.text
        .split(/\r?\n/)
        .map(&:strip)
        .select(&:present?)
        .join("\r\n")
  end

  private

  # The page the item links to, fetched once with Mechanize.
  def page
    @page ||= Mechanize.new.get(url)
  end

  # @return the first element of +page+ matched by one of
  #   #elements_with_article_body, or nil when nothing matches
  def extract_body_from_page(page)
    return unless page

    elements_with_article_body.each do |selector|
      match = page.at(selector)
      return match if match
    end

    nil
  end
end
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment