sonota88 · February 20, 2011 21:22
diff --git a/custom_plus.rb b/custom_plus.rb
 # -*- coding: utf-8 -*-
 #require 'pp'
 require 'open-uri'
 require 'time'
 require 'rss'
 require 'yaml'

 require 'rubygems'
 require 'yapra/plugin/mechanize_base'

 # Looks up LDRize SITEINFO entries (downloaded from wedata.net and cached in
 # a local file) and converts the entry matching a page into an XPath
 # configuration hash.
 module LDRize
   extend self

   SITEINFO_EXPIRE_HOUR = 24
   SITEINFO_CACHE_PATH = File.expand_path("~/.custom_plus_ldrize_siteinfo.yaml")
   SITEINFO_URLS = "http://wedata.net/databases/LDRize/items.json"


   # Tells whether the cache file is older than SITEINFO_EXPIRE_HOUR hours.
   # A missing cache file counts as expired.
   #
   # cache_path:: path of the cache file (defaults to SITEINFO_CACHE_PATH)
   def siteinfo_cache_expired?(cache_path = SITEINFO_CACHE_PATH)
     return true unless File.exist?(cache_path)

     Time.now - File.mtime(cache_path) > SITEINFO_EXPIRE_HOUR * 60 * 60
   end


   # Returns the XPath configuration for the first siteinfo entry whose
   # "domain" regexp matches page.uri, or nil when no usable entry exists.
   #
   # page::       object responding to #uri (the URL is taken via uri.to_s)
   # cache_path:: path of the cache file (defaults to SITEINFO_CACHE_PATH)
   #
   # The returned hash has the keys "capture", "split", "title", "link" and
   # "description".
   def siteinfo(page, cache_path = SITEINFO_CACHE_PATH)
     # An empty cache is what an interrupted download leaves behind, so it is
     # refreshed as well instead of being trusted until it expires.
     if File.zero?(cache_path) || siteinfo_cache_expired?(cache_path)
       refresh_siteinfo_cache(cache_path)
     end

     # NOTE(review): this parses downloaded data with YAML.load; on Psych
     # versions older than 4 that can instantiate arbitrary objects. Consider
     # YAML.safe_load or JSON.parse once the minimum Ruby version is known.
     siteinfo_list = YAML.load( File.read(cache_path) )
     return nil unless siteinfo_list.kind_of?(Array)

     # TODO: match on the xpath as well, not only on the domain.
     url = page.uri.to_s
     si = siteinfo_list.find { |s| siteinfo_matches?(s, url) }
     return nil if si.nil?

     data = si["data"]
     # Without these two expressions no usable configuration can be built.
     return nil if data["paragraph"].nil? || data["link"].nil?

     description = data["description"] || "."
     title = data["title"] || data["link"]

     {
       "capture"     => "//body",
       "split"       => unescape_slashes(data["paragraph"]),
       "title"       => unescape_slashes(title),
       "link"        => unescape_slashes(data["link"]),
       "description" => unescape_slashes(description),
     }
   end


   private

   # Downloads the siteinfo list and stores it in cache_path. The download
   # completes before the cache file is opened, so a failed request cannot
   # leave a truncated cache behind.
   def refresh_siteinfo_cache(cache_path)
     # URI.open is the open-uri entry point on newer Rubies; Kernel#open is
     # kept as the fallback for Rubies where URI.open is not public.
     body = if URI.respond_to?(:open)
              URI.open(SITEINFO_URLS) { |io| io.read }
            else
              open(SITEINFO_URLS) { |io| io.read }
            end

     File.open(cache_path, "w") { |f| f.print body }
   end

   # True when the entry's "domain" regexp matches url. Entries that cannot
   # be evaluated (invalid regexp, missing data) are reported and skipped.
   def siteinfo_matches?(entry, url)
     domain = entry["data"]["domain"]
     (Regexp.compile(domain) =~ url) ? true : false
   rescue => e
     p [ e.message, domain ]
     false
   end

   # Turns the JSON-style escaped slash sequence into a plain slash.
   def unescape_slashes(str)
     str.gsub('\\/', "/")
   end
 end


 module Yapra::Plugin::Feed

  # require:
  #
  # Feed::Extract
  #
  # example:
  # 
  # config:
  #   url: URL
  # 
  # return:
  # 
  # data: (feed items / item == instance of RSS::RDF::Item)
  #   - title: title
  #     link: link
  #     description: description
  #     date: pub_date
  #   - title: ...
  #     ...
  # 
  # see also:
  # 
  # hAtom 0.1 · Microformats Wiki
  # http://microformats.org/wiki/hatom

  # Plugin that extracts feed items from every configured URL. It first runs
  # Feed::Extract with the configured "extract_xpath"; when that yields no
  # items it retries with the configuration returned by LDRize.siteinfo.
  class CustomPlus < Yapra::Plugin::MechanizeBase

    # Collapses whitespace in str and limits it to at most length characters.
    # Truncated text ends in " ...".
    #
    # str::    a string, or an object responding to #inner_text; nil is
    #          treated as an empty string
    # length:: maximum number of characters in the result
    def shorten(str, length)
      str = str.inner_text if str.respond_to? :inner_text

      chars = str.to_s.gsub(/\s+/, " ").strip.split(//u)
      return chars.join("") if chars.size <= length

      ellipsis = " ..."
      keep = length - ellipsis.size
      # Too short to hold the ellipsis: plain truncation.
      return chars[0, [length, 0].max].join("") if keep <= 0

      # Keep room for the ellipsis so the result never exceeds length.
      (chars[0, keep] << ellipsis).join("")
    end


    # Runs the Feed::Extract plugin for url with the given XPath
    # configuration and returns whatever that plugin returns.
    def extract(url, xconfig)
      pipeline.execute_plugin(
                              { "module" => "Feed::Extract",
                                "config" => {
                                  "url" => url,
                                  "extract_xpath" => xconfig
                                }
                              },
                              nil)
    end


    # Plugin entry point. Returns the items extracted from all configured
    # URLs; the data argument is not used. URLs for which nothing can be
    # extracted contribute no items.
    def run(data)
      urls = if config['url'].kind_of?(Array)
               config['url']
             else
               [ config['url'] ]
             end
      result = []

      urls.each do |url|
        page = agent.get(url)

        # First attempt: the configured XPath settings (the original note
        # says "xconfig or hAtom").
        items = extract(url, config["extract_xpath"])

        # Second attempt: the LDRize siteinfo for this page.
        if items.nil? || items.empty?
          logger.debug "use LDRize siteinfo"
          xconfig = LDRize.siteinfo(page)
          logger.debug "LDRize siteinfo => " + xconfig.inspect
          items = extract(url, xconfig)
        end

        # Nothing extracted by either attempt: skip this URL instead of
        # iterating over nil.
        if items.nil? || items.empty?
          logger.debug "no items extracted: " + url.to_s
          next
        end

        items.each { |item| fill_missing_fields(item) }
        result.concat(items)
      end

      result
    end


    private

    # Gives an item a title (derived from its description) and a date (the
    # current time) when they are missing.
    def fill_missing_fields(item)
      if item.title.nil?
        item.title = shorten(item.description, 100)
        item.title = "(title not found)" if /\A\s*\Z/ =~ item.title
      end

      item.date = Time.now if item.date.nil?
    end
  end
 end
	# -*- coding: utf-8 -*-
	#require 'pp'
	require 'open-uri'
	require 'time'
	require 'rss'

	require 'rubygems'
	require 'yapra/plugin/mechanize_base'

	# Looks up LDRize SITEINFO entries (downloaded from wedata.net and cached in
	# a local file) and converts the entry matching a page into an XPath
	# configuration hash.
	module LDRize
	  extend self

	  SITEINFO_EXPIRE_HOUR = 24
	  SITEINFO_CACHE_PATH = File.expand_path("~/.custom_plus_ldrize_siteinfo.yaml")
	  SITEINFO_URLS = "http://wedata.net/databases/LDRize/items.json"


	  # Tells whether the cache file is older than SITEINFO_EXPIRE_HOUR hours.
	  # A missing cache file counts as expired.
	  #
	  # cache_path:: path of the cache file (defaults to SITEINFO_CACHE_PATH)
	  def siteinfo_cache_expired?(cache_path = SITEINFO_CACHE_PATH)
	    return true unless File.exist?(cache_path)

	    Time.now - File.mtime(cache_path) > SITEINFO_EXPIRE_HOUR * 60 * 60
	  end


	  # Returns the XPath configuration for the first siteinfo entry whose
	  # "domain" regexp matches page.uri, or nil when no usable entry exists.
	  #
	  # page::       object responding to #uri (the URL is taken via uri.to_s)
	  # cache_path:: path of the cache file (defaults to SITEINFO_CACHE_PATH)
	  #
	  # The returned hash has the keys "capture", "split", "title", "link" and
	  # "description".
	  def siteinfo(page, cache_path = SITEINFO_CACHE_PATH)
	    # An empty cache is what an interrupted download leaves behind, so it is
	    # refreshed as well instead of being trusted until it expires.
	    if File.zero?(cache_path) || siteinfo_cache_expired?(cache_path)
	      refresh_siteinfo_cache(cache_path)
	    end

	    # NOTE(review): this parses downloaded data with YAML.load; on Psych
	    # versions older than 4 that can instantiate arbitrary objects. Consider
	    # YAML.safe_load or JSON.parse once the minimum Ruby version is known.
	    siteinfo_list = YAML.load( File.read(cache_path) )
	    return nil unless siteinfo_list.kind_of?(Array)

	    # TODO: match on the xpath as well, not only on the domain.
	    url = page.uri.to_s
	    si = siteinfo_list.find { |s| siteinfo_matches?(s, url) }
	    return nil if si.nil?

	    data = si["data"]
	    # Without these two expressions no usable configuration can be built.
	    return nil if data["paragraph"].nil? || data["link"].nil?

	    description = data["description"] || "."
	    title = data["title"] || data["link"]

	    {
	      "capture"     => "//body",
	      "split"       => unescape_slashes(data["paragraph"]),
	      "title"       => unescape_slashes(title),
	      "link"        => unescape_slashes(data["link"]),
	      "description" => unescape_slashes(description),
	    }
	  end


	  private

	  # Downloads the siteinfo list and stores it in cache_path. The download
	  # completes before the cache file is opened, so a failed request cannot
	  # leave a truncated cache behind.
	  def refresh_siteinfo_cache(cache_path)
	    # URI.open is the open-uri entry point on newer Rubies; Kernel#open is
	    # kept as the fallback for Rubies where URI.open is not public.
	    body = if URI.respond_to?(:open)
	             URI.open(SITEINFO_URLS) { |io| io.read }
	           else
	             open(SITEINFO_URLS) { |io| io.read }
	           end

	    File.open(cache_path, "w") { |f| f.print body }
	  end

	  # True when the entry's "domain" regexp matches url. Entries that cannot
	  # be evaluated (invalid regexp, missing data) are reported and skipped.
	  def siteinfo_matches?(entry, url)
	    domain = entry["data"]["domain"]
	    (Regexp.compile(domain) =~ url) ? true : false
	  rescue => e
	    p [ e.message, domain ]
	    false
	  end

	  # Turns the JSON-style escaped slash sequence into a plain slash.
	  def unescape_slashes(str)
	    str.gsub('\\/', "/")
	  end
	end


	module Yapra::Plugin::Feed

	# require:
	#
	# Feed::Extract
	#
	# example:
	#
	# config:
	#   url: URL
	#
	# return:
	#
	# data: (feed items / item == instance of RSS::RDF::Item)
	#   - title: title
	#     link: link
	#     description: description
	#     date: pub_date
	#   - title: ...
	#     ...
	#
	# see also:
	#
	# hAtom 0.1 · Microformats Wiki
	# http://microformats.org/wiki/hatom

	# Plugin that extracts feed items from every configured URL. It first runs
	# Feed::Extract with the configured "extract_xpath"; when that yields no
	# items it retries with the configuration returned by LDRize.siteinfo.
	class CustomPlus < Yapra::Plugin::MechanizeBase

	  # Collapses whitespace in str and limits it to at most length characters.
	  # Truncated text ends in " ...".
	  #
	  # str::    a string, or an object responding to #inner_text; nil is
	  #          treated as an empty string
	  # length:: maximum number of characters in the result
	  def shorten(str, length)
	    str = str.inner_text if str.respond_to? :inner_text

	    chars = str.to_s.gsub(/\s+/, " ").strip.split(//u)
	    return chars.join("") if chars.size <= length

	    ellipsis = " ..."
	    keep = length - ellipsis.size
	    # Too short to hold the ellipsis: plain truncation.
	    return chars[0, [length, 0].max].join("") if keep <= 0

	    # Keep room for the ellipsis so the result never exceeds length.
	    (chars[0, keep] << ellipsis).join("")
	  end


	  # Runs the Feed::Extract plugin for url with the given XPath
	  # configuration and returns whatever that plugin returns.
	  def extract(url, xconfig)
	    pipeline.execute_plugin(
	      { "module" => "Feed::Extract",
	        "config" => {
	          "url" => url,
	          "extract_xpath" => xconfig
	        }
	      },
	      nil)
	  end


	  # Plugin entry point. Returns the items extracted from all configured
	  # URLs; the data argument is not used. URLs for which nothing can be
	  # extracted contribute no items.
	  def run(data)
	    urls = if config['url'].kind_of?(Array)
	             config['url']
	           else
	             [ config['url'] ]
	           end
	    result = []

	    urls.each do |url|
	      page = agent.get(url)

	      # First attempt: the configured XPath settings (the original note
	      # says "xconfig or hAtom").
	      items = extract(url, config["extract_xpath"])

	      # Second attempt: the LDRize siteinfo for this page.
	      if items.nil? || items.empty?
	        logger.debug "use LDRize siteinfo"
	        xconfig = LDRize.siteinfo(page)
	        logger.debug "LDRize siteinfo => " + xconfig.inspect
	        items = extract(url, xconfig)
	      end

	      # Nothing extracted by either attempt: skip this URL instead of
	      # iterating over nil.
	      if items.nil? || items.empty?
	        logger.debug "no items extracted: " + url.to_s
	        next
	      end

	      items.each { |item| fill_missing_fields(item) }
	      result.concat(items)
	    end

	    result
	  end


	  private

	  # Gives an item a title (derived from its description) and a date (the
	  # current time) when they are missing.
	  def fill_missing_fields(item)
	    if item.title.nil?
	      item.title = shorten(item.description, 100)
	      item.title = "(title not found)" if /\A\s*\Z/ =~ item.title
	    end

	    item.date = Time.now if item.date.nil?
	  end
	end
	end