class Feed < ActiveRecord::Base
  has_many :items, :order => "published_at desc", :dependent => :delete_all
  has_many :feed_errors

  # This is problematic because on update, the feed_url can be changed. It can
  # be changed to the feed_url of another feed. TODO Figure out a way to
  # automatically merge two feeds in this case.
  # validates_uniqueness_of :feed_url
  validates_presence_of :feed_url

  has_many :feed_discoveries, :dependent => :destroy
  has_many :subscriptions, :dependent => :destroy

  named_scope :popular, :order => 'subscriptions_count desc', :limit => 50
  named_scope :subscribed_to, :order => "subscriptions_count desc, feeds.title asc",
                              :conditions => "subscriptions_count > 0"

  attr_reader :feedzirra_feed
  alias :ff :feedzirra_feed
  attr_reader :encoding, :xml

  serialize :download_times

  # Before calling this, create or initialize the feed first by setting the
  # feed_url. This method will add or update entries as necessary.
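  #
  # A rough usage sketch (the finder and URL below are illustrative, assuming
  # Rails 2.x dynamic finders):
  #
  #   feed = Feed.find_or_initialize_by_feed_url("http://example.com/atom.xml")
  #   feed.fetch_and_parse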
  def fetch_and_parse
    @xml = fetch_raw(self.feed_url)
    parse
  # rescue Feedzirra::NoParserAvailable => ex
  rescue Exception => ex
    create_feed_error_from_exception(ex)
  end

  def stale?
    self.last_downloaded_at < 1.hour.ago
  end

  def slow?
    median_download_time && median_download_time > 5
  end

  # deletes items and starts over
  def start_over
    items.delete_all
    fetch_and_parse
  end

  def fetch_raw(url)
    start_time = Time.now
    xml = Feedzirra::Feed.fetch_raw(url)
    calculate_download_times(start_time)
    xml
  end

  def calculate_download_times(start_time)
    if self.download_times.nil? || !self.download_times.is_a?(Array)
      self.download_times = []
    end
    self.download_times = (self.download_times << (Time.now - start_time))
    self.average_download_time = mean(download_times)
    self.median_download_time = median(download_times)
    save!
  end

  def parse(xml=@xml)
    @feedzirra_feed = Feedzirra::Feed.parse xml
    if @feedzirra_feed
      # No errors, means we can create the Feed
      analyze_xml_decl
      self.update_from_feedzirra
    end
  rescue Feedzirra::NoParserAvailable => ex
    create_feed_error_from_exception(ex)
  end

  def analyze_xml_decl
    @encoding = nil
    md = /<\?xml [^>]+\?>/.match @xml
    if md
      declaration = md[0]
      # at this point, we have a string like
      # <?xml version="1.0" encoding="ISO-8859-1"?>
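      # (for that example declaration, @encoding ends up as "ISO-8859-1")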
      md2 = /encoding=["']([^"']+)["']/i.match declaration
      if md2
        this_encoding = md2[1]
      end
    end
    @encoding = this_encoding ? this_encoding.upcase : nil
  end
  def create_feed_error_from_exception(ex)
    if self.new_record?
      FeedError.create :error_type => ex.class.to_s, :message => ex.message, :feed_url => self.feed_url, :trace => ex.backtrace
    else
      self.feed_errors.create :error_type => ex.class.to_s, :message => ex.message, :trace => ex.backtrace
    end
    return false
  end
  # Assumes @feedzirra_feed has been obtained
  def update_from_feedzirra
    # ff.sanitize_entries! # for some reason, this inserts <p> tags in the
    # titles!
    self.update_attributes :last_downloaded_at => Time.now,
                           :title => ff.title,
                           :etag => ff.etag,
                           :web_url => ff.url,
                           :last_modified_at => ff.last_modified
    if ff.feed_url
      self.update_attribute :feed_url, ff.feed_url
    end
    asciify_title
    if self.items.empty?
      save!
      create_items(ff.entries)
    else
      add_new_items
    end
    update_statistics
    save!
  end

  def create_items(ff_entries)
    ff_entries[0,20].each do |e|
      logger.info "-" * 80
      logger.info "Creating item: #{e.title} at #{Time.zone.now}"
      item = self.items.create :published_at => e.published,
                               :title => e.title,
                               :url => e.url,
                               :author => e.author,
                               :summary => e.summary,
                               :content => e.content
      item.process_content
      logger.info "Done processing item: #{e.title} at #{Time.zone.now}"
    end
  end

  def add_new_items
    return if ff.entries.empty?
    newer_than_date = self.items.first.published_at || self.items.first.created_at
    if ff.entries.first.published.to_i > newer_than_date.to_i
      self.create_items( ff.entries.select {|e| e.published > newer_than_date} )
    end
  end

  def update_statistics
    self.average_words_per_item = self.items.with_content.average(:word_count).to_f.round.to_i
    # TODO posting frequency and avg words per day
  end

  def self.update_subscribed_feeds
    self.subscribed_to.each do |feed|
      if feed.last_downloaded_at > 1.hour.ago
        puts "skipping #{feed.title}"
        next
      end
      feed.fetch_and_parse
      puts "updated #{feed.title}"
    end
  end

  def asciify_title
    self.title = self.title.strip if self.title
    self.update_attribute :title, Iconv.conv("US-ASCII//IGNORE//TRANSLIT", 'UTF-8', self.title)
  end

  def self.asciify_titles
    Feed.all.each do |x|
      puts "asciifying #{x.title}"
      x.asciify_title
      puts "asciified #{x.title}"
      puts
    end
  end

  # virtual attributes
  def title
    read_attribute(:title) || "no title"
  end

  # for calculating median and mean download times
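  # (e.g. for download_times of [0.8, 1.2, 2.5] seconds, the mean is 1.5 and
  # the median is 1.2; for an even-sized array the median averages the two
  # middle values)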
  def mean(array)
    array.inject(0) { |sum, x| sum += x } / array.size.to_f
  end

  def median(array, already_sorted=false)
    return nil if array.empty?
    array = array.sort unless already_sorted
    m_pos = array.size / 2
    return array.size % 2 == 1 ? array[m_pos] : mean(array[m_pos-1..m_pos])
  end
end
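
# A rough sketch of how this model might be exercised from script/console or a
# scheduled rake task (the feed URL is illustrative):
#
#   feed = Feed.new(:feed_url => "http://example.com/feed.xml")
#   feed.fetch_and_parse             # downloads the XML, parses it, creates items
#   feed.stale?                      # => true once last_downloaded_at is over an hour old
#   feed.start_over                  # deletes items and re-fetches
#
#   Feed.update_subscribed_feeds     # refreshes every feed with at least one subscriber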
ActiveRecord::Schema.define(:version => 20090226004897) do

  create_table "feeds", :force => true do |t|
    t.datetime "last_downloaded_at"
    t.datetime "last_modified_at"
    t.string   "title"
    t.string   "subtitle"
    t.string   "feed_url"
    t.string   "web_url"
    t.string   "favicon_url"
    t.integer  "average_words_per_item"
    t.integer  "average_items_per_day"
    t.integer  "unparseable_entries_count", :default => 0
    t.integer  "subscriptions_count",       :default => 0
    t.datetime "created_at"
    t.datetime "updated_at"
    t.string   "etag"
    t.text     "download_times"
    t.float    "average_download_time"
    t.float    "median_download_time"
  end

end