Skip to content

Instantly share code, notes, and snippets.

@dimianstudio
Created February 25, 2015 08:40
Show Gist options
  • Save dimianstudio/f9722026ec1e49fdb82a to your computer and use it in GitHub Desktop.
Save dimianstudio/f9722026ec1e49fdb82a to your computer and use it in GitHub Desktop.
module Fetcher
module Base
# Base feed processor: reads every RSS source, wraps each <item> node in
# source.item_class and stores the enabled items as Article records.
#
# Subclasses must implement #rss_sources. Errors raised while reading a feed
# or creating an article are collected in @problems and mailed through
# NotificationMailer at the end of #process; in the development environment
# they are re-raised immediately instead.
class Processor
  attr_reader :source

  # @param source [Object] must respond to #item_class and #id
  def initialize(source)
    @source = source
    @problems = []
  end

  # Feed location(s) to read; a single value or an array (see Array() below).
  #
  # @raise [NotImplementedError] always, in this base class
  def rss_sources
    fail NotImplementedError, 'rss_sources is not implemented'
  end

  # Reads all feeds, creates an Article for every item that is not disabled
  # and finally reports the collected problems (if any).
  def process
    Array(rss_sources).each do |rss_source|
      rss_contents = read_rss(rss_source)
      next unless rss_contents # reading failed; already recorded in @problems

      Nokogiri::XML(rss_contents).search('item').each_with_index do |node, position|
        item = source.item_class.new(node, position)
        next if item.disabled?

        create_article(item)
      end
    end
    notify_admin_about_errors!
  end

  private

  # Sends one mail with every problem collected so far; no-op when there are none.
  def notify_admin_about_errors!
    return if @problems.empty?

    NotificationMailer.problems_while_fetch(@problems).deliver_now
  end

  # Returns the feed contents as a String, or nil when reading failed.
  #
  # The block form of `open` closes the underlying handle as soon as the
  # contents are read instead of leaving it open until garbage collection.
  # NOTE(review): `open` only handles URLs when open-uri has patched
  # Kernel#open; on Ruby >= 3.0 this has to become `URI.open` - confirm the
  # Ruby version this runs on.
  def read_rss(rss_source)
    open(rss_source, &:read)
  rescue => e
    raise if Rails.env.development?

    @problems << OpenStruct.new(e: e, type: :fetch_xml_source, metadata: rss_source)
    nil
  end

  # Persists one item as an Article. Any failure (including errors raised by
  # the item's own accessors) is recorded in @problems outside development.
  def create_article(item)
    Article.create!(
      name: item.name,
      url: item.url,
      body: item.body,
      image_url: item.image_url,
      published_at: item.published_at,
      source_id: source.id
    )
  rescue => e
    raise if Rails.env.development?

    @problems << OpenStruct.new(e: e, type: :create_article, metadata: item)
  end
end
# Wraps a single RSS <item> node (`item`) together with its position in the
# feed (`index`) and exposes the attributes needed to build an Article.
#
# Subclasses must implement #elements_with_article_body and may override
# #elements_for_removing and #cleanup_body.
class Item < Struct.new(:item, :index)
  # Content of the item's first <link> element.
  def url
    @url ||= item.search('link').first.content
  end

  # Content of the item's first <title> element.
  def name
    @name ||= item.search('title').first.content
  end

  # `url` attribute of the first <enclosure> element, or nil when the item
  # has no enclosure or the enclosure has no url. Only these expected
  # "missing" cases yield nil; unexpected errors are no longer swallowed.
  def image_url
    @image_url ||= begin
      enclosure = item.search('enclosure').first
      enclosure && enclosure['url']
    end
  end

  # <pubDate> content parsed into a Time.
  def published_at
    @published_at ||= Time.parse(item.search('pubDate').first.content)
  end

  # String form of the page images whose first FastImage.size component is
  # greater than 400; images whose size cannot be determined are skipped.
  def images
    @images ||= page.images.select do |image|
      (FastImage.size(image.url.to_s) || []).first.to_i > 400
    end.map(&:to_s)
  end

  # Cleaned-up article text taken from the linked page, or nil when none of
  # the #elements_with_article_body selectors matches.
  def body
    return @body if @body.present?

    # @body temporarily holds the extracted node, so cleanup_body overrides
    # that still read @body keep working.
    @body = extract_body_from_page(page)
    return unless @body

    (%w[script style iframe] + elements_for_removing).each do |selector|
      @body.search(selector).remove
    end
    @body = cleanup_body(@body)
    @body.strip
  end

  # True when an Article with this url already exists.
  def disabled?
    Article.exists?(url: url)
  end

  # Selectors tried in order to locate the article body on the page.
  #
  # @raise [NotImplementedError] always, in this base class
  def elements_with_article_body
    fail NotImplementedError, 'elements_with_article_body is not implemented'
  end

  # Extra selectors removed from the body besides script, style and iframe.
  def elements_for_removing
    []
  end

  # Normalises the text of `body` (anything responding to #text): splits it
  # on every LF / CRLF line break, drops blank lines, strips each remaining
  # line and joins the result with "\r\n".
  #
  # Uses the `body` argument rather than @body, and also splits on a bare
  # "\n" that is directly followed by text, so the result never mixes
  # "\n" and "\r\n" line endings.
  def cleanup_body(body)
    body.text
        .split(/\r?\n/)
        .select(&:present?)
        .map(&:strip)
        .join("\r\n")
  end

  private

  # Memoised page fetched from #url.
  def page
    @page ||= Mechanize.new.get(url)
  end

  # First node matching one of #elements_with_article_body, or nil.
  def extract_body_from_page(page)
    return unless page

    elements_with_article_body.each do |selector|
      node = page.at(selector)
      return node if node
    end
    nil
  end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment