Skip to content

Instantly share code, notes, and snippets.

@sonota88
Created February 20, 2011 21:22
Show Gist options
  • Save sonota88/836322 to your computer and use it in GitHub Desktop.
Save sonota88/836322 to your computer and use it in GitHub Desktop.
yapra/plugins/feed/custom_plus.rb / Extract feed using user defined XPath or hAtom or LDRize siteinfo.
# -*- coding: utf-8 -*-
#require 'pp'
require 'open-uri'
require 'time'
require 'rss'
require 'yaml'
require 'rubygems'
require 'yapra/plugin/mechanize_base'
# Looks up LDRize siteinfo (a list of per-site XPath hints downloaded from
# wedata.net and cached in a local file) and converts the entry that matches
# a page into an XPath configuration hash.
module LDRize
  extend self

  # Age, in hours, after which the cached siteinfo is downloaded again.
  SITEINFO_EXPIRE_HOUR = 24
  # Default location of the local siteinfo cache.
  SITEINFO_CACHE_PATH = File.expand_path("~/.custom_plus_ldrize_siteinfo.yaml")
  # URL the siteinfo list is downloaded from.
  SITEINFO_URLS = "http://wedata.net/databases/LDRize/items.json"

  # Tells whether the cache file is older than SITEINFO_EXPIRE_HOUR hours.
  #
  # cache_path - path of the cache file (defaults to SITEINFO_CACHE_PATH).
  #
  # Returns true or false. Raises Errno::ENOENT when the file does not exist.
  def siteinfo_cache_expired?(cache_path = SITEINFO_CACHE_PATH)
    Time.now - File.mtime(cache_path) > SITEINFO_EXPIRE_HOUR * 60 * 60
  end

  # Builds the XPath configuration for +page+ from the first usable siteinfo
  # entry whose "domain" regexp matches the page URI. The cache is downloaded
  # first when it is missing or expired.
  #
  # page       - any object whose +uri+ converts to the page URL via +to_s+.
  # cache_path - path of the cache file (defaults to SITEINFO_CACHE_PATH).
  #
  # Returns a Hash with the keys "capture", "split", "title", "link" and
  # "description", or nil when no usable entry matches the page.
  def siteinfo(page, cache_path = SITEINFO_CACHE_PATH)
    if !File.exist?(cache_path) || siteinfo_cache_expired?(cache_path)
      refresh_siteinfo_cache(cache_path)
    end

    # NOTE(review): the cache content comes from the network. YAML.load is
    # kept because the downloaded JSON has always been read this way;
    # consider YAML.safe_load or JSON.parse if the target Ruby allows it.
    siteinfo_list = YAML.load(File.read(cache_path))
    # An empty or malformed cache does not load as a list; treat it as
    # "no siteinfo available" instead of failing on a non-Array value.
    entries = siteinfo_list.kind_of?(Array) ? siteinfo_list : []

    url = page.uri.to_s
    # TODO: besides the domain regexp, the entry's xpath should be checked too.
    entry = entries.find { |candidate| usable_entry?(candidate, url) }
    return nil if entry.nil?

    data = entry["data"]
    description = data["description"] || "."
    title = data["title"] || data["link"]
    {
      "capture" => "//body",
      "split" => unescape_slashes(data["paragraph"]),
      "title" => unescape_slashes(title),
      "link" => unescape_slashes(data["link"]),
      "description" => unescape_slashes(description),
    }
  end

  private

  # Downloads the siteinfo list and stores it at +cache_path+. The download
  # happens before the file is opened for writing, so a failed download
  # leaves an existing cache untouched instead of truncating it.
  def refresh_siteinfo_cache(cache_path)
    body = download_siteinfo
    File.open(cache_path, "w") { |f| f.print body }
  end

  # Fetches the raw siteinfo document and closes the connection afterwards.
  def download_siteinfo
    reader = lambda { |io| io.read }
    if URI.respond_to?(:open)
      # Newer open-uri exposes URI.open; Kernel#open no longer accepts URLs
      # on Ruby 3.0 and later.
      URI.open(SITEINFO_URLS, &reader)
    else
      open(SITEINFO_URLS, &reader)
    end
  end

  # True when +entry+ carries the XPaths needed to build a configuration
  # ("paragraph" and "link") and its domain regexp matches +url+.
  def usable_entry?(entry, url)
    data = entry.kind_of?(Hash) ? entry["data"] : nil
    return false unless data.kind_of?(Hash)
    return false if data["paragraph"].nil? || data["link"].nil?
    domain_matches?(data["domain"], url)
  end

  # Matches +url+ against the regexp source +domain+. A domain that cannot
  # be compiled is reported on stdout and treated as not matching.
  def domain_matches?(domain, url)
    begin
      re = Regexp.compile(domain)
    rescue => e
      p [ e.message, domain ]
      return false
    end
    re =~ url ? true : false
  end

  # Replaces the escaped form "\/" left in siteinfo values with a plain "/".
  def unescape_slashes(xpath)
    xpath.gsub('\\/', "/")
  end
end
module Yapra::Plugin::Feed
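# Extracts feed items from one or more pages, first with the configured
# XPath settings (or hAtom) and then, if that yields nothing, with the
# LDRize siteinfo that matches the page.
#
# require:
#
#   Feed::Extract
#
# example:
#
#   config:
#     url: URL                  # a single URL or a list of URLs
#     extract_xpath: ...        # passed on to Feed::Extract as-is (presumably optional; verify)
#
# return:
#
#   data: (feed items / item == instance of RSS::RDF::Item)
#     - title: title
#       link: link
#       description: description
#       date: pub_date
#     - title: ...
#       ...
#
# see also:
#
#   hAtom 0.1 · Microformats Wiki
#   http://microformats.org/wiki/hatom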
class CustomPlus < Yapra::Plugin::MechanizeBase
# Collapses every run of whitespace in +str+ into a single space, strips the
# ends and limits the result to at most +length+ characters.
#
# str    - a String, or an object responding to +inner_text+ (its text is
#          used); nil is treated as an empty string.
# length - maximum number of characters in the result.
#
# Returns the normalized String. When it had to be cut, the result ends with
# " ..." and the marker counts towards +length+; if +length+ leaves no room
# for the marker the text is simply truncated.
def shorten(str, length)
  str = str.inner_text if str.respond_to? :inner_text
  # Split into characters (not bytes) so multibyte text is cut safely.
  chars = str.to_s.gsub(/\s+/, " ").strip.split(//u)
  return chars.join("") if chars.size <= length

  ellipsis = " ..."
  keep = length - ellipsis.size
  # A negative or zero budget must not turn into a negative range end,
  # which would keep almost the whole string.
  return chars.first([length, 0].max).join("") if keep <= 0

  chars.first(keep).join("") + ellipsis
end
# Runs the Feed::Extract plugin through the pipeline for +url+, handing it
# +xconfig+ as the "extract_xpath" setting.
#
# Returns whatever pipeline.execute_plugin returns.
def extract(url, xconfig)
  plugin_config = {
    "url" => url,
    "extract_xpath" => xconfig
  }
  command = {
    "module" => "Feed::Extract",
    "config" => plugin_config
  }
  # No input data is passed to the plugin.
  pipeline.execute_plugin(command, nil)
end
# Extracts feed items from every configured URL.
#
# Each URL is first processed with the configured "extract_xpath" (or hAtom);
# when that yields nothing, the LDRize siteinfo matching the page is tried.
# URLs for which nothing can be extracted contribute no items. Items without
# a title get one derived from their description, and items without a date
# are stamped with the current time.
#
# data - unused.
#
# Returns an Array with the items collected from all URLs.
def run(data)
  urls = config['url'].kind_of?(Array) ? config['url'] : [ config['url'] ]
  result = []
  urls.each do |url|
    page = agent.get(url)

    # Try xconfig or hAtom
    items = extract(url, config["extract_xpath"])

    # or try LDRize siteinfo
    if items.nil? || items.empty?
      logger.debug "use LDRize siteinfo"
      xconfig = LDRize.siteinfo(page)
      logger.debug "LDRize siteinfo => " + xconfig.inspect
      # Without a siteinfo there is nothing new to try.
      items = xconfig.nil? ? nil : extract(url, xconfig)
    end

    # Both attempts came back empty: skip this URL instead of iterating
    # over nil.
    next if items.nil? || items.empty?

    items.each do |item|
      if item.title.nil?
        # An item may lack a description as well; fall through to the
        # placeholder title in that case.
        title = item.description.nil? ? "" : shorten(item.description, 100)
        title = "(title not found)" if /\A\s*\Z/ =~ title
        item.title = title
      end
      item.date = Time.now if item.date.nil?
    end
    result += items
  end
  result
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment