Created
February 13, 2011 12:57
-
-
Save sonota88/824665 to your computer and use it in GitHub Desktop.
yapra/plugins/feed/extract.rb / Extract feed using user defined XPath or hAtom.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
require 'date'
require 'time'
require 'uri'
require 'yapra/plugin/mechanize_base'
module Yapra::Plugin::Feed | |
# description: | |
# | |
# Extract as feed from a web page using hAtom or user defined XPath. | |
# | |
# example: | |
# | |
#   - module: Feed::Extract
#     config:
#       url: http://example.com/
# | |
# or | |
# | |
#   - module: Feed::Extract
#     config:
#       url: http://example.com/
#       extract_xpath:
#         title: {xpath}
#         link: {xpath}
#         description: {xpath}
#         date: {xpath}
# | |
# extract_xpath is optional. Intended usage:
#   1. First try fetching with only url specified.
#   2. If nothing can be extracted that way, or the extracted result is
#      unsuitable, define extract_xpath yourself.
# | |
# Note:
#
# When title, date, etc. cannot be obtained or are invalid,
# nil is returned and the caller is expected to handle it.
class Extract < Yapra::Plugin::MechanizeBase | |
# Matches a string that *begins* with a timestamp of the form
# YYYY-MM-DDTh:mm:ss (one- or two-digit hour; ":" or "." before the seconds).
# Anchored with \A rather than ^ so that only the very start of the string
# counts, not the start of any later line in a multi-line value.
HATOM_DATE_RE = /\A\d{4}-\d{2}-\d{2}T\d?\d:\d{2}[:.]\d{2}/
# Sets up the built-in hAtom XPath configuration.
#
# @hatom_date_xpath selects elements whose class contains "updated" or
# "published"; @xconfig_for_hatom is the full XPath set (capture, split,
# title, link, description, date) built from the hAtom class/rel names.
def initialize
  super
  updated   = xpath_contains("class", "updated")
  published = xpath_contains("class", "published")
  @hatom_date_xpath = ".//*[#{updated} or #{published}]"

  # Wraps a single "attribute contains token" test into a descendant query.
  descendant_with = lambda { |attr, token| ".//*[#{xpath_contains(attr, token)}]" }

  @xconfig_for_hatom = {
    "capture"     => "//body",
    "split"       => descendant_with.call("class", "hentry"),
    "title"       => descendant_with.call("class", "entry-title"),
    "link"        => descendant_with.call("rel", "bookmark"),
    "description" => descendant_with.call("class", "entry-content"),
    "date"        => @hatom_date_xpath
  }
end
# Builds an XPath predicate that is true when the whitespace-separated
# attribute +attr+ contains the token +value+.
#
# The attribute value is normalized and padded with a space on both sides so
# that the token only matches as a whole word (e.g. "hentry" does not match
# "hentry-extra").
def xpath_contains(attr, value)
  padded_attribute = "concat(' ',normalize-space(@#{attr}),' ')"
  "contains(#{padded_attribute}, ' #{value} ')"
end
# Extracts the entry title.
#
# element:: node that responds to #xpath
# xpath::   XPath selecting the title node, or nil when none is configured
#
# Returns the stripped text of the first matching node, or nil when +xpath+
# is nil or nothing matches. The XPath is evaluated only once.
def title(element, xpath)
  return nil unless xpath

  node = element.xpath(xpath)[0]
  node ? node.text.strip : nil
end
# Extracts the entry link as an absolute URL.
#
# element:: node that responds to #xpath
# xpath::   XPath selecting the link node, or nil when none is configured
# url::     URL of the page the entry was found on; base for relative hrefs
#
# Returns nil when +xpath+ is nil, nothing matches, or the matched node has
# no href attribute. An href that already starts with http:// or https:// is
# returned unchanged. Anything else (root-relative, document-relative,
# protocol-relative, ...) is resolved against +url+ with URI.join.
def link(element, xpath, url)
  return nil unless xpath

  node = element.xpath(xpath)[0]
  href = node && node.attr("href")
  return nil unless href
  return href if href =~ %r{\Ahttps?://}i

  begin
    URI.join(url, href).to_s
  rescue URI::InvalidURIError
    # URI.join rejects hrefs with raw spaces or non-ASCII characters. Fall
    # back to plain concatenation onto "scheme://host" so such links are
    # still delivered instead of being dropped.
    site_root = url.split("/")[0..2].join("/")
    href =~ %r{\A/} ? site_root + href : "#{site_root}/#{href}"
  end
end
# Picks the node used as the entry description.
#
# Without an +xpath+ the whole +element+ is the description; otherwise the
# first node matching +xpath+ is returned (nil when nothing matches).
def description(element, xpath)
  return element unless xpath

  element.xpath(xpath)[0]
end
# Extracts the entry date as a Time.
#
# element:: node that responds to #xpath
# xpath::   XPath selecting the date node, or nil when none is configured
#
# When +xpath+ is the built-in hAtom date XPath and the node's title
# attribute starts with a timestamp (HATOM_DATE_RE), that attribute is parsed
# with Time.parse. Otherwise the node text is parsed via Date.parse, so any
# time-of-day in the text is dropped.
#
# Returns nil when +xpath+ is nil, nothing matches, or the value cannot be
# parsed. This includes a title attribute that looks like a timestamp but is
# out of range. The XPath is evaluated only once.
def date(element, xpath)
  return nil unless xpath

  node = element.xpath(xpath)[0]
  return nil unless node

  if @hatom_date_xpath == xpath
    stamp = node.attr("title")
    return Time.parse(stamp) if HATOM_DATE_RE =~ stamp
  end

  Time.parse(Date.parse(node.text).to_s)
rescue ArgumentError
  # Raised by Date.parse / Time.parse for unparseable or out-of-range input.
  nil
end
# Splits a fetched page into per-entry nodes.
#
# page::   object whose #root responds to #xpath
# config:: plugin configuration hash; its "extract_xpath" entry, when
#          present, replaces the built-in hAtom XPath set
#
# If the XPath set has a "capture" query the search is narrowed to its result
# first; the "split" query is then applied to obtain the entries.
def entries(page, config)
  xconfig = config["extract_xpath"] || @xconfig_for_hatom

  scope = page.root
  capture = xconfig['capture']
  scope = scope.xpath(capture) if capture

  scope.xpath(xconfig['split'])
end
# Fetches config["url"] and converts every extracted entry into an
# RSS::RDF::Item.
#
# The user-supplied config["extract_xpath"] is used when present, otherwise
# the built-in hAtom XPath set. The +data+ argument is not used by this
# method; the return value is the result of mapping the extracted entries to
# items.
def run(data)
  url = config["url"]
  page = agent.get(url)
  xconfig = config["extract_xpath"] || @xconfig_for_hatom

  entries(page, config).map do |entry|
    item = RSS::RDF::Item.new
    item.title       = title(entry, xconfig["title"])
    item.link        = link(entry, xconfig["link"], url)
    item.description = description(entry, xconfig["description"])
    item.date        = date(entry, xconfig["date"])
    item
  end
end
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment