mertonium · April 29, 2019 23:17 · julien51 · Apr 21, 2015
diff --git a/command_line_example.rb b/command_line_example.rb
 :001 > require 'feedjira'
 :002 > require './janky_parser.rb'
 :003 > feed_url = 'https://gist.githubusercontent.com/mertonium/11087612/raw/b9402604180704d41614ddb92e1f3cdd9c09b273/janky_feed.xml'
 # Add our custom parser to the top of the stack of feeds Feedjira uses.
 :004 > Feedjira::Feed.add_feed_class Feedjira::Parser::Versa::JankyPublisher

 # Fetch and parse our example feed
 :005 > feed = Feedjira::Feed.fetch_and_parse feed_url 
 :006 > feed.class
 => Feedjira::Parser::Versa::JankyPublisher # Sweet! Our parser was chosen to parse the feed.
 :007 > feed.entries.first.url
 => "http://example.com/heres-an-article"   # The url doesn't have a query string!
 :008 > feed.entries.first.published
 => 2009-09-06 16:20:00 UTC                 # The published date is the correct one
 
diff --git a/feedjira.rb b/feedjira.rb
 # Rails initializer for the Feedjira gem. It should live in
 # config/initializers/feedjira.rb
 #
 # Our app/parsers folder is autoloaded, so these classes should be available here.
 # Custom parsers to hand to Feedjira, processed in the order listed.
 # Uncomment an entry (or add a new one) to register another parser.
 custom_parsers = [
   Feedjira::Parser::Versa::JankyPublisher,
   # Feedjira::Parser::Versa::AnotherPublisher,
   # Feedjira::Parser::Versa::AThirdPublisher
 ]

 custom_parsers.each do |parser_class|
   Feedjira::Feed.add_feed_class(parser_class)
 end
diff --git a/janky_feed.xml b/janky_feed.xml
 <?xml version="1.0" encoding="UTF-8" ?>
 <rss version="2.0">
 <channel>
   <title>A Janky RSS Feed</title>
   <description>This is an example of an RSS feed</description>
   <link>http://www.jankybutlovablepublisher.com/</link>
   <lastBuildDate>Mon, 06 Sep 2010 00:01:00 +0000 </lastBuildDate>
   <pubDate>Mon, 06 Sep 2009 16:20:00 +0000 </pubDate>
   <ttl>1800</ttl>
   
   <item>
    <title>Example entry 1</title>
    <description>Here is some text containing an interesting description.</description>
    <link>http://example.com/heres-an-article?source=rss</link>
    <guid>foo</guid>
    <dc:created>2009/09/06 00/00/00</dc:created>
    <pubDate>Mon, 06 Sep 2009 16:20:00 +0000 </pubDate>
   </item>
   
   <item>
    <title>Example entry 2</title>
    <description>Here is some text containing an interesting description.</description>
    <link>http://example.com/another-article?source=rss</link>
    <guid>bar</guid>
    <dc:created>2009/09/06 00/00/00</dc:created>
    <pubDate>Mon, 06 Sep 2009 16:22:00 +0000 </pubDate>
   </item>
   
  </channel>
 </rss>
diff --git a/janky_parser.rb b/janky_parser.rb
 module Feedjira
  module Parser
    # It's good practice to namespace your parsers, so we'll put
    # this one in the Versa namespace.
    module Versa
    
      ### Entry Parser Class ###
      # This first class is for parsing an individual <item> in the feed.
      # We define it first because our top-level parser needs to be able to reference it.
      # By convention, this class name is the same as our top level parser
      # but with "Entry" appended.
      class JankyPublisherEntry
        include SAXMachine
        include FeedEntryUtilities

        # Declare the fields we want to parse out of each <item>. The :as
        # option names the attribute the element's value is exposed under.
        element :title
        element :link, :as => :url
        element :description, :as => :summary
        element :pubDate, :as => :published
        element :guid, :as => :entry_id

        # We remove the query string from the url by overriding the 'url' method
        # originally defined by including FeedEntryUtilities in our class.
        # (see https://github.com/feedjira/feedjira/blob/master/lib/feedjira/feed_entry_utilities.rb)
        #
        # Returns the link with everything from the first '?' to the end of
        # the line removed, or nil when no link was parsed for this entry.
        # The stripped value is written back to @url, so later reads of the
        # instance variable see the cleaned url as well.
        def url
          # `&&=` skips the substitution when @url is nil (an <item> without a
          # <link>), where calling gsub directly would raise NoMethodError.
          @url &&= @url.gsub(/\?.*$/, '')
        end
      end
      
      
      ### Feed Parser Class ###
      # This class is for parsing the top level feed fields.
      class JankyPublisher
        include SAXMachine
        include FeedUtilities

        # Define the fields we want to parse using SAX Machine declarations
        element :title
        element :link, :as => :url
        element :description

        # Parse all the <item>s in the feed with the entry class defined above
        elements :item, :as => :entries, :class => Versa::JankyPublisherEntry

        attr_accessor :feed_url

        # This method is required by all Feedjira parsers. To decide which
        # parser to use, Feedjira cycles through each parser it knows about
        # and passes the first 2000 characters of the feed to this method.
        #
        # To make sure your parser is only used when it's supposed to be used,
        # test for something unique in those first 2000 characters. URLs seem
        # to be a good choice.
        #
        # This parser, for example, is looking for an occurrence of
        # '<link>http://www.jankybutlovablepublisher.com/' which we should
        # only really find in the feed we are targeting.
        #
        # xml - the leading portion of the feed document (String, may be nil).
        #
        # Returns true when the marker is present, false otherwise.
        def self.able_to_parse?(xml)
          # Regexp#=~ yields a match offset or nil; convert that to a real
          # boolean so the predicate behaves as its name promises.
          !(%r{<link>http://www\.jankybutlovablepublisher\.com/} =~ xml).nil?
        end
      end
    end
  end
 end
diff --git a/janky_parser_spec.rb b/janky_parser_spec.rb
 # Example spec for testing that your parser works as it's supposed to
 require 'feedjira'
 require './janky_parser'

 # Register the custom parser before any example asks Feedjira to parse.
 Feedjira::Feed.add_feed_class(Feedjira::Parser::Versa::JankyPublisher)

 describe Feedjira::Parser::Versa::JankyPublisher do
   # Read the fixture feed from disk again before every example.
   before(:each) do
     fixture_path = File.join(File.dirname(__FILE__), '/janky_feed.xml')
     @feed_xml = File.read(fixture_path)
   end

   describe '#able_to_parse?' do
     it 'should return true if the <link> tag contains "http://www.jankybutlovablepublisher.com/"' do
       parser_class = Feedjira::Parser::Versa::JankyPublisher
       parser_class.able_to_parse?(@feed_xml).should be_true
     end
   end

   describe 'the parser' do
     it 'should pull out the entries properly' do
       first_entry = Feedjira::Feed.parse(@feed_xml).entries.first
       first_entry.class.should eq(Feedjira::Parser::Versa::JankyPublisherEntry)
     end
   end
 end

 describe Feedjira::Parser::Versa::JankyPublisher do
   describe 'a parsed entry' do
     # The fixture is read and parsed once, while this example group is being
     # defined; the examples below close over the resulting first entry.
     fixture_xml = File.read(File.join(File.dirname(__FILE__), '/janky_feed.xml'))
     first_entry = Feedjira::Feed.parse(fixture_xml).entries.first

     it 'has the correct title' do
       first_entry.title.should == "Example entry 1"
     end

     it 'has the correct url' do
       first_entry.url.should == "http://example.com/heres-an-article"
     end

     it 'has the correct entry_id' do
       first_entry.entry_id.should == "foo"
     end

     it 'has the correct published time' do
       expected_time = Time.parse("Mon, 06 Sep 2009 16:20:00 +0000")
       first_entry.published.should == expected_time
     end
   end
 end
	:001 > require 'feedjira'
	:002 > require './janky_parser.rb'
	:003 > feed_url = 'https://gist.githubusercontent.com/mertonium/11087612/raw/b9402604180704d41614ddb92e1f3cdd9c09b273/janky_feed.xml'
	# Add our custom parser to the top of the stack of feeds Feedjira uses.
	:004 > Feedjira::Feed.add_feed_class Feedjira::Parser::Versa::JankyPublisher

	# Fetch and parse our example feed
	:005 > feed = Feedjira::Feed.fetch_and_parse feed_url
	:006 > feed.class
	=> Feedjira::Parser::Versa::JankyPublisher # Sweet! Our parser was chosen to parse the feed.
	:007 > feed.entries.first.url
	=> "http://example.com/heres-an-article" # The url doesn't have a query string!
	:008 > feed.entries.first.published
	=> 2009-09-06 16:20:00 UTC # The published date is the correct one
	# Rails initializer for the Feedjira gem. It should live in
	# config/initializers/feedjira.rb
	#
	# Our app/parsers folder is autoloaded, so these classes should be available here.
	# Custom parsers to hand to Feedjira, processed in the order listed.
	# Uncomment an entry (or add a new one) to register another parser.
	[
	  Feedjira::Parser::Versa::JankyPublisher,
	  # Feedjira::Parser::Versa::AnotherPublisher,
	  # Feedjira::Parser::Versa::AThirdPublisher
	].each { |parser| Feedjira::Feed.add_feed_class parser }
	<?xml version="1.0" encoding="UTF-8" ?>
	<rss version="2.0">
	<channel>
	<title>A Janky RSS Feed</title>
	<description>This is an example of an RSS feed</description>
	<link>http://www.jankybutlovablepublisher.com/</link>
	<lastBuildDate>Mon, 06 Sep 2010 00:01:00 +0000 </lastBuildDate>
	<pubDate>Mon, 06 Sep 2009 16:20:00 +0000 </pubDate>
	<ttl>1800</ttl>

	<item>
	<title>Example entry 1</title>
	<description>Here is some text containing an interesting description.</description>
	<link>http://example.com/heres-an-article?source=rss</link>
	<guid>foo</guid>
	<dc:created>2009/09/06 00/00/00</dc:created>
	<pubDate>Mon, 06 Sep 2009 16:20:00 +0000 </pubDate>
	</item>

	<item>
	<title>Example entry 2</title>
	<description>Here is some text containing an interesting description.</description>
	<link>http://example.com/another-article?source=rss</link>
	<guid>bar</guid>
	<dc:created>2009/09/06 00/00/00</dc:created>
	<pubDate>Mon, 06 Sep 2009 16:22:00 +0000 </pubDate>
	</item>

	</channel>
	</rss>
	module Feedjira
	module Parser
	# It's good practice to namespace your parsers, so we'll put
	# this one in the Versa namespace.
	module Versa

	### Entry Parser Class ###
	# This first class is for parsing an individual <item> in the feed.
	# We define it first because our top-level parser needs to be able to reference it.
	# By convention, this class name is the same as our top level parser
	# but with "Entry" appended.
	class JankyPublisherEntry
	  include SAXMachine
	  include FeedEntryUtilities

	  # Declare the fields we want to parse out of each <item>. The :as
	  # option names the attribute the element's value is exposed under.
	  element :title
	  element :link, :as => :url
	  element :description, :as => :summary
	  element :pubDate, :as => :published
	  element :guid, :as => :entry_id

	  # We remove the query string from the url by overriding the 'url' method
	  # originally defined by including FeedEntryUtilities in our class.
	  # (see https://github.com/feedjira/feedjira/blob/master/lib/feedjira/feed_entry_utilities.rb)
	  #
	  # Returns the link with everything from the first '?' to the end of
	  # the line removed, or nil when no link was parsed for this entry.
	  # The stripped value is written back to @url, so later reads of the
	  # instance variable see the cleaned url as well.
	  def url
	    # `&&=` skips the substitution when @url is nil (an <item> without a
	    # <link>), where calling gsub directly would raise NoMethodError.
	    @url &&= @url.gsub(/\?.*$/, '')
	  end
	end


	### Feed Parser Class ###
	# This class is for parsing the top level feed fields.
	class JankyPublisher
	  include SAXMachine
	  include FeedUtilities

	  # Define the fields we want to parse using SAX Machine declarations
	  element :title
	  element :link, :as => :url
	  element :description

	  # Parse all the <item>s in the feed with the entry class defined above
	  elements :item, :as => :entries, :class => Versa::JankyPublisherEntry

	  attr_accessor :feed_url

	  # This method is required by all Feedjira parsers. To decide which
	  # parser to use, Feedjira cycles through each parser it knows about
	  # and passes the first 2000 characters of the feed to this method.
	  #
	  # To make sure your parser is only used when it's supposed to be used,
	  # test for something unique in those first 2000 characters. URLs seem
	  # to be a good choice.
	  #
	  # This parser, for example, is looking for an occurrence of
	  # '<link>http://www.jankybutlovablepublisher.com/' which we should
	  # only really find in the feed we are targeting.
	  #
	  # xml - the leading portion of the feed document (String, may be nil).
	  #
	  # Returns true when the marker is present, false otherwise.
	  def self.able_to_parse?(xml)
	    # Regexp#=~ yields a match offset or nil; convert that to a real
	    # boolean so the predicate behaves as its name promises.
	    !(%r{<link>http://www\.jankybutlovablepublisher\.com/} =~ xml).nil?
	  end
	end
	end
	end
	end
	# Example spec for testing that your parser works as it's supposed to
	require 'feedjira'
	require './janky_parser'

	# Register the custom parser before any example asks Feedjira to parse.
	Feedjira::Feed.add_feed_class(Feedjira::Parser::Versa::JankyPublisher)

	describe Feedjira::Parser::Versa::JankyPublisher do
	  # Read the fixture feed from disk again before every example.
	  before(:each) do
	    fixture_path = File.join(File.dirname(__FILE__), '/janky_feed.xml')
	    @feed_xml = File.read(fixture_path)
	  end

	  describe '#able_to_parse?' do
	    it 'should return true if the <link> tag contains "http://www.jankybutlovablepublisher.com/"' do
	      parser_class = Feedjira::Parser::Versa::JankyPublisher
	      parser_class.able_to_parse?(@feed_xml).should be_true
	    end
	  end

	  describe 'the parser' do
	    it 'should pull out the entries properly' do
	      first_entry = Feedjira::Feed.parse(@feed_xml).entries.first
	      first_entry.class.should eq(Feedjira::Parser::Versa::JankyPublisherEntry)
	    end
	  end
	end

	describe Feedjira::Parser::Versa::JankyPublisher do
	  describe 'a parsed entry' do
	    # The fixture is read and parsed once, while this example group is being
	    # defined; the examples below close over the resulting first entry.
	    fixture_xml = File.read(File.join(File.dirname(__FILE__), '/janky_feed.xml'))
	    first_entry = Feedjira::Feed.parse(fixture_xml).entries.first

	    it 'has the correct title' do
	      first_entry.title.should == "Example entry 1"
	    end

	    it 'has the correct url' do
	      first_entry.url.should == "http://example.com/heres-an-article"
	    end

	    it 'has the correct entry_id' do
	      first_entry.entry_id.should == "foo"
	    end

	    it 'has the correct published time' do
	      expected_time = Time.parse("Mon, 06 Sep 2009 16:20:00 +0000")
	      first_entry.published.should == expected_time
	    end
	  end
	end