Skip to content

Instantly share code, notes, and snippets.

@manveru
Created December 28, 2008 06:27
Show Gist options
  • Save manveru/40393 to your computer and use it in GitHub Desktop.
Save manveru/40393 to your computer and use it in GitHub Desktop.
%w[hpricot json open-uri].each{|lib| require(lib) }
# Minimal RSS 2.0 / Atom reader built on top of Hpricot.
#
# A parsed feed exposes its feed-level fields through #meta and one object per
# <item> (RSS) or <entry> (Atom) element through #childs. Field values are
# stored as stripped text unless the field has a dedicated handler (see the
# HANDLE tables on Item, Entry and Meta).
class Feed
  # Opens +uri+, parses it with Hpricot and returns a populated Feed.
  #
  # NOTE(review): for names without a URL scheme this ends up in Kernel#open,
  # which treats a leading "|" as a command to spawn. Do not pass untrusted
  # input here.
  #
  # @param uri [String] URL or local path of the feed document
  # @return [Feed] the populated feed
  # @raise [ArgumentError] when the document contains neither RSS nor Atom
  #   markers
  def self.parse(uri)
    # open-uri stopped patching Kernel#open for URLs in Ruby 3.0; URI.open is
    # the supported entry point where it exists and still falls back to
    # Kernel#open for plain paths. Older Rubies only have the patched Kernel.
    opener = Object.const_defined?(:URI) && URI.respond_to?(:open) ? URI : Kernel

    # Block form so the underlying file / tempfile is closed once parsed.
    doc = opener.open(uri) { |io| Hpricot.XML(io) }

    feed = new(uri)
    if doc.at(:item)
      feed.parse_rss2(doc)
    elsif doc.at(:entry)
      feed.parse_atom(doc)
    elsif doc.at('rss/channel')
      # RSS document without any <item>: still a valid (empty) feed.
      feed.parse_rss2(doc)
    elsif doc.at(:feed)
      # Atom document without any <entry>: still a valid (empty) feed.
      feed.parse_atom(doc)
    else
      raise ArgumentError, "Cannot parse this"
    end
    feed
  end

  # uri    - the value handed to the constructor
  # meta   - Feed::Meta holding the feed-level fields
  # childs - Array of Feed::Item / Feed::Entry, in document order
  attr_reader :uri, :meta, :childs

  # @param uri [String] where the feed came from; kept for reference only
  def initialize(uri)
    @uri = uri
    @childs = []
    @meta = Meta.new(self)
  end

  # Fills #meta from the <feed> element and #childs from every <entry>.
  #
  # @param doc [#/] parsed document (anything answering to the search
  #   operator the way an Hpricot document does)
  def parse_atom(doc)
    parse_meta(doc, :feed)
    parse_common(doc, :entry, Entry)
  end

  # Fills #meta from <rss>/<channel> and #childs from every <item>.
  #
  # @param doc [#/] parsed document
  def parse_rss2(doc)
    parse_meta(doc, 'rss/channel')
    parse_common(doc, :item, Item)
  end

  # Builds one +klass+ instance per node matched by +selector+ and copies
  # each named child node into it.
  #
  # @param doc [#/] parsed document
  # @param selector [String, Symbol] search expression for the entry nodes
  # @param klass [Class] Child subclass to instantiate for each match
  def parse_common(doc, selector, klass)
    (doc / selector).each do |node|
      obj = klass.new(self)
      @childs << obj
      each_element(node) { |child| obj[child.name] = child }
    end
  end

  # Copies the named children of the node(s) matched by +selector+ into
  # #meta, leaving out the entries themselves.
  #
  # @param doc [#/] parsed document
  # @param selector [String, Symbol] search expression for the feed root
  def parse_meta(doc, selector)
    (doc / selector).each do |node|
      each_element(node) do |child|
        # Entries/items are collected separately by #parse_common.
        next if child.name == 'entry' || child.name == 'item'
        @meta[child.name] = child
      end
    end
  end

  # String-keyed bag of fields extracted from one XML node. Subclasses
  # provide a HANDLE table mapping a field name to the lambda that converts
  # the raw node; fields without a handler are stored as stripped inner text.
  class Child
    # Parses the node's text as a timestamp.
    HANDLE_TIME = lambda { |time| Time.parse(time.inner_text.strip) }

    # Atom links carry the target in the href attribute, RSS links carry it
    # as element text; accept either. Yields nil when neither is present.
    HANDLE_LINK = lambda { |link|
      href = link[:href]
      if href
        href
      else
        text = link.inner_text.strip
        text.empty? ? nil : text
      end
    }

    # Flattens an <author> element into { child name => stripped text }.
    HANDLE_AUTHOR = lambda { |author|
      author.children.each_with_object({}) do |child, fields|
        next unless child.respond_to?(:name)
        fields[child.name] = child.inner_text.strip
      end
    }

    # Default handler table, so Child itself (and subclasses that define no
    # HANDLE of their own) simply store every field as text.
    HANDLE = {}

    # parent - the owning Feed
    # list   - Hash of field name (String) => converted value
    attr_reader :parent, :list

    # @param parent [Feed] the feed this object belongs to
    def initialize(parent)
      @parent = parent
      @list = {}
    end

    # Stores +value+ under +key+, converting it with the handler registered
    # for that field name, or to stripped inner text when there is none.
    #
    # @param key [String, Symbol] field name
    # @param value [#inner_text] the raw node
    def []=(key, value)
      # Normalise first so symbol keys reach the same handler as strings.
      name = key.to_s
      handler = self.class::HANDLE[name]
      @list[name] = handler ? handler.call(value) : value.inner_text.strip
    end

    # @param key [String, Symbol] field name
    # @return [Object, nil] the stored value, or nil when the field is absent
    def [](key)
      @list[key.to_s]
    end
  end

  # One RSS <item>.
  class Item < Child
    HANDLE = { 'pubDate' => HANDLE_TIME }
  end

  # One Atom <entry>.
  class Entry < Child
    HANDLE = { 'link' => HANDLE_LINK, 'author' => HANDLE_AUTHOR,
               'updated' => HANDLE_TIME, 'published' => HANDLE_TIME }
  end

  # Feed-level fields (children of <feed> or <rss>/<channel>).
  class Meta < Child
    HANDLE = { 'link' => HANDLE_LINK, 'author' => HANDLE_AUTHOR,
               'updated' => HANDLE_TIME }
  end

  private

  # Yields every child of +node+ that responds to #name; other children
  # (presumably text and comment nodes - confirm against Hpricot) are skipped.
  def each_element(node)
    node.children.each do |child|
      yield child if child.respond_to?(:name)
    end
  end
end
__END__
require 'pp'

# Manual smoke test: parse a feed file from the local disk and pretty-print
# the resulting Feed object.
feed_path = '/home/manveru/websites/anarchaia.org/index.atom'
pp Feed.parse(feed_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment