sonota88 · February 13, 2011 12:57
diff --git a/extract.rb b/extract.rb
 # -*- coding: utf-8 -*-

 require 'uri'
 require 'yapra/plugin/mechanize_base'

 module Yapra::Plugin::Feed

  # description:
  # 
  # Extract as feed from a web page using hAtom or user defined XPath.
  #
  # example:
  # 
  # - module: Feed::Extract
  #   config: 
  #     url: http://example.com/
  #
  # or
  #
  # - module: Feed::Extract
  #   config: 
  #     url: http://example.com/
  #     extract_xpath:
  #       title: {xpath}
  #       link: {xpath}
  #       description: {xpath}
  #       date: {xpath}
  #
  # extract_xpath は必須ではなく、
  # 1. まず url だけを指定して取得する
  # 2. それで取得できない場合、または取得できたものでは不都合な場合は
  #    extract_xpath をユーザが定義する。
  #
  # 備考:
  # 
  # title, date などが取得できない・不正な場合は
  # nil を返し、呼び出し元で処理する。

  class Extract < Yapra::Plugin::MechanizeBase

    # Matches the start of a timestamp shaped like YYYY-MM-DDTh:mm:ss
    # (one- or two-digit hour; ":" or "." before the seconds). #date tests
    # the "title" attribute of a hAtom date node against it before handing
    # that attribute to Time.parse.
    HATOM_DATE_RE = /^\d{4}-\d{2}-\d{2}T\d?\d:\d{2}[:\.]\d{2}/

    # Builds the XPath rule set used when the plugin configuration has no
    # "extract_xpath" entry, i.e. when the page is read as hAtom markup.
    def initialize
      super

      # Wraps an attribute test into a "any descendant matching ..." query.
      descendant_with = lambda { |attr, token| ".//*[#{xpath_contains(attr, token)}]" }

      # A hAtom date node carries either the "updated" or "published" class.
      updated_test   = xpath_contains("class", "updated")
      published_test = xpath_contains("class", "published")
      @hatom_date_xpath = ".//*[#{updated_test} or #{published_test}]"

      @xconfig_for_hatom = {
        "capture"     => "//body",
        "split"       => descendant_with.call("class", "hentry"),
        "title"       => descendant_with.call("class", "entry-title"),
        "link"        => descendant_with.call("rel", "bookmark"),
        "description" => descendant_with.call("class", "entry-content"),
        "date"        => @hatom_date_xpath
      }
    end
    
    
    # Returns an XPath predicate that is true when the whitespace-separated
    # attribute {attr} contains {value} as one of its tokens.
    def xpath_contains(attr, value)
      # Pad the normalized attribute with spaces so a token can be matched
      # as " value " without hitting substrings of longer tokens.
      padded_attribute = "concat(' ',normalize-space(@#{attr}),' ')"
      padded_token = "' #{value} '"
      "contains(#{padded_attribute}, #{padded_token})"
    end

    
    # Returns the stripped text of the first node matched by +xpath+ inside
    # +element+, or nil when +xpath+ is nil or nothing matches.
    #
    # element - node responding to #xpath.
    # xpath   - XPath selecting the title node (may be nil).
    def title(element, xpath)
      # No rule given: report "no title" instead of passing nil to #xpath.
      return nil unless xpath

      # Run the query once rather than once to test and once to read.
      heading = element.xpath(xpath)[0]
      heading ? heading.text.strip : nil
    end

    
    # Returns the absolute URL referenced by the first node that +xpath+
    # matches inside +element+, or nil when +xpath+ is nil, nothing matches,
    # or the node has no usable "href" attribute.
    #
    # element - node responding to #xpath.
    # xpath   - XPath selecting the link node (may be nil).
    # url     - URL of the page being processed; relative hrefs are
    #           resolved against it.
    def link(element, xpath, url)
      return nil unless xpath

      anchor = element.xpath(xpath)[0]
      href = anchor && anchor.attr("href")
      return nil if href.nil? || href.strip.empty?

      href = href.strip
      # Already an absolute http(s) URL: hand it back untouched.
      return href if %r{\Ahttps?://}i =~ href

      begin
        # Resolve "/path", "path", "../path", "?query" and "//host/path"
        # against the page URL instead of gluing href onto scheme://host.
        URI.join(url, href).to_s
      rescue URI::Error
        # Unparseable href or base: return it as found rather than abort.
        href
      end
    end

    
    # Returns the node used as the entry description: the first match of
    # +xpath+ inside +element+, or +element+ itself when no xpath is given.
    def description(element, xpath)
      return element unless xpath

      matches = element.xpath(xpath)
      matches[0]
    end

    
    # Returns the entry date as a Time, or nil when +xpath+ is nil, nothing
    # matches, or no date can be parsed.
    #
    # When +xpath+ is the built-in hAtom date query and the node's "title"
    # attribute matches HATOM_DATE_RE, that attribute is parsed as a full
    # timestamp. Otherwise (or if that parse fails) the node text is parsed
    # as a calendar date.
    #
    # element - node responding to #xpath.
    # xpath   - XPath selecting the date node (may be nil).
    def date(element, xpath)
      return nil unless xpath

      # Run the query once; the original re-ran it for every access.
      stamp_node = element.xpath(xpath)[0]
      return nil unless stamp_node

      if @hatom_date_xpath == xpath
        machine_stamp = stamp_node.attr("title")
        if HATOM_DATE_RE =~ machine_stamp
          begin
            return Time.parse(machine_stamp)
          rescue ArgumentError
            # The regexp only checks the shape; values such as month 13
            # still fail here. Fall through to the visible text.
          end
        end
      end

      begin
        # Date.parse first, so only the calendar date of the text is kept.
        Time.parse(Date.parse(stamp_node.text).to_s)
      rescue ArgumentError
        nil
      end
    end

    
    # Returns the nodes that each represent one feed entry on +page+.
    #
    # page   - fetched page; page.root must respond to #xpath.
    # config - plugin configuration hash. Its "extract_xpath" entry, when
    #          present, supplies the rules; otherwise @xconfig_for_hatom
    #          is used.
    #
    # Rules read: "capture" (optional XPath narrowing the search area) and
    # "split" (required XPath selecting one node per entry).
    #
    # Raises ArgumentError when the chosen rules have no "split" XPath.
    def entries(page, config)
      rules = config["extract_xpath"] || @xconfig_for_hatom

      split = rules["split"]
      unless split
        # Fail with a readable message instead of passing nil to #xpath.
        raise ArgumentError, 'extract_xpath needs a "split" XPath that selects one node per entry'
      end

      capture = rules["capture"]
      scope = capture ? page.root.xpath(capture) : page.root
      scope.xpath(split)
    end

    
    # Fetches the configured "url" and maps every extracted entry to an
    # RSS::RDF::Item carrying title, link, description and date.
    # The +data+ argument is not read by this method.
    def run(data)
      source_url = config["url"]
      fetched_page = agent.get(source_url)

      # User-defined rules win; otherwise fall back to the hAtom rules.
      rules = config["extract_xpath"] || @xconfig_for_hatom

      entries(fetched_page, config).map do |node|
        feed_item = RSS::RDF::Item.new
        feed_item.title       = title(node, rules["title"])
        feed_item.link        = link(node, rules["link"], source_url)
        feed_item.description = description(node, rules["description"])
        feed_item.date        = date(node, rules["date"])
        feed_item
      end
    end
  end
 end
	# -- coding: utf-8 --

	require 'yapra/plugin/mechanize_base'

	module Yapra::Plugin::Feed

	# description:
	#
	# Extract as feed from a web page using hAtom or user defined XPath.
	#
	# example:
	#
	# - module: Feed::Extract
	# config:
	# url: http://example.com/
	#
	# or
	#
	# - module: Feed::Extract
	# config:
	# url: http://example.com/
	# extract_xpath:
	# title: {xpath}
	# link: {xpath}
	# description: {xpath}
	# date: {xpath}
	#
	# extract_xpath は必須ではなく、
	# 1. まず url だけを指定して取得する
	# 2. それで取得できない場合、または取得できたものでは不都合な場合は
	# extract_xpath をユーザが定義する。
	#
	# 備考:
	#
	# title, date などが取得できない・不正な場合は
	# nil を返し、呼び出し元で処理する。

	class Extract < Yapra::Plugin::MechanizeBase

	# Matches the start of a timestamp shaped like YYYY-MM-DDTh:mm:ss
	# (one- or two-digit hour; ":" or "." before the seconds). #date tests
	# the "title" attribute of a hAtom date node against it before handing
	# that attribute to Time.parse.
	HATOM_DATE_RE = /^\d{4}-\d{2}-\d{2}T\d?\d:\d{2}[:\.]\d{2}/

	# Builds the XPath rule set used when the plugin configuration has no
	# "extract_xpath" entry, i.e. when the page is read as hAtom markup.
	def initialize
	  super

	  # Wraps an attribute test into a "any descendant matching ..." query.
	  descendant_with = lambda { |attr, token| ".//*[#{xpath_contains(attr, token)}]" }

	  # A hAtom date node carries either the "updated" or "published" class.
	  updated_test   = xpath_contains("class", "updated")
	  published_test = xpath_contains("class", "published")
	  @hatom_date_xpath = ".//*[#{updated_test} or #{published_test}]"

	  @xconfig_for_hatom = {
	    "capture"     => "//body",
	    "split"       => descendant_with.call("class", "hentry"),
	    "title"       => descendant_with.call("class", "entry-title"),
	    "link"        => descendant_with.call("rel", "bookmark"),
	    "description" => descendant_with.call("class", "entry-content"),
	    "date"        => @hatom_date_xpath
	  }
	end


	# Returns an XPath predicate that is true when the whitespace-separated
	# attribute {attr} contains {value} as one of its tokens.
	def xpath_contains(attr, value)
	  # Pad the normalized attribute with spaces so a token can be matched
	  # as " value " without hitting substrings of longer tokens.
	  padded_attribute = "concat(' ',normalize-space(@#{attr}),' ')"
	  padded_token = "' #{value} '"
	  "contains(#{padded_attribute}, #{padded_token})"
	end


	# Returns the stripped text of the first node matched by +xpath+ inside
	# +element+, or nil when +xpath+ is nil or nothing matches.
	#
	# element - node responding to #xpath.
	# xpath   - XPath selecting the title node (may be nil).
	def title(element, xpath)
	  # No rule given: report "no title" instead of passing nil to #xpath.
	  return nil unless xpath

	  # Run the query once rather than once to test and once to read.
	  heading = element.xpath(xpath)[0]
	  heading ? heading.text.strip : nil
	end


	# Returns the absolute URL referenced by the first node that +xpath+
	# matches inside +element+, or nil when +xpath+ is nil, nothing matches,
	# or the node has no usable "href" attribute.
	#
	# element - node responding to #xpath.
	# xpath   - XPath selecting the link node (may be nil).
	# url     - URL of the page being processed; relative hrefs are
	#           resolved against it.
	def link(element, xpath, url)
	  return nil unless xpath

	  anchor = element.xpath(xpath)[0]
	  href = anchor && anchor.attr("href")
	  return nil if href.nil? || href.strip.empty?

	  href = href.strip
	  # Already an absolute http(s) URL: hand it back untouched.
	  return href if %r{\Ahttps?://}i =~ href

	  begin
	    # Resolve "/path", "path", "../path", "?query" and "//host/path"
	    # against the page URL instead of gluing href onto scheme://host.
	    URI.join(url, href).to_s
	  rescue URI::Error
	    # Unparseable href or base: return it as found rather than abort.
	    href
	  end
	end


	# Returns the node used as the entry description: the first match of
	# +xpath+ inside +element+, or +element+ itself when no xpath is given.
	def description(element, xpath)
	  return element unless xpath

	  matches = element.xpath(xpath)
	  matches[0]
	end


	# Returns the entry date as a Time, or nil when +xpath+ is nil, nothing
	# matches, or no date can be parsed.
	#
	# When +xpath+ is the built-in hAtom date query and the node's "title"
	# attribute matches HATOM_DATE_RE, that attribute is parsed as a full
	# timestamp. Otherwise (or if that parse fails) the node text is parsed
	# as a calendar date.
	#
	# element - node responding to #xpath.
	# xpath   - XPath selecting the date node (may be nil).
	def date(element, xpath)
	  return nil unless xpath

	  # Run the query once; the original re-ran it for every access.
	  stamp_node = element.xpath(xpath)[0]
	  return nil unless stamp_node

	  if @hatom_date_xpath == xpath
	    machine_stamp = stamp_node.attr("title")
	    if HATOM_DATE_RE =~ machine_stamp
	      begin
	        return Time.parse(machine_stamp)
	      rescue ArgumentError
	        # The regexp only checks the shape; values such as month 13
	        # still fail here. Fall through to the visible text.
	      end
	    end
	  end

	  begin
	    # Date.parse first, so only the calendar date of the text is kept.
	    Time.parse(Date.parse(stamp_node.text).to_s)
	  rescue ArgumentError
	    nil
	  end
	end


	# Returns the nodes that each represent one feed entry on +page+.
	#
	# page   - fetched page; page.root must respond to #xpath.
	# config - plugin configuration hash. Its "extract_xpath" entry, when
	#          present, supplies the rules; otherwise @xconfig_for_hatom
	#          is used.
	#
	# Rules read: "capture" (optional XPath narrowing the search area) and
	# "split" (required XPath selecting one node per entry).
	#
	# Raises ArgumentError when the chosen rules have no "split" XPath.
	def entries(page, config)
	  rules = config["extract_xpath"] || @xconfig_for_hatom

	  split = rules["split"]
	  unless split
	    # Fail with a readable message instead of passing nil to #xpath.
	    raise ArgumentError, 'extract_xpath needs a "split" XPath that selects one node per entry'
	  end

	  capture = rules["capture"]
	  scope = capture ? page.root.xpath(capture) : page.root
	  scope.xpath(split)
	end


	# Fetches the configured "url" and maps every extracted entry to an
	# RSS::RDF::Item carrying title, link, description and date.
	# The +data+ argument is not read by this method.
	def run(data)
	  url = config["url"]
	  page = agent.get(url)

	  # User-defined rules win; otherwise fall back to the hAtom rules.
	  xconfig = config["extract_xpath"] || @xconfig_for_hatom

	  # Block parameter written as |entry|; the backslash-escaped pipes in
	  # the pasted text were not valid Ruby.
	  entries(page, config).map { |entry|
	    item = RSS::RDF::Item.new
	    item.title = title(entry, xconfig["title"])
	    item.link = link(entry, xconfig["link"], url)
	    item.description = description(entry, xconfig["description"])
	    item.date = date(entry, xconfig["date"])
	    item
	  }
	end
	end
	end