Created
December 25, 2008 03:44
-
-
Save fuba/39847 to your computer and use it in GitHub Desktop.
いま自分ではつかってないのでメンテナやる人がいたらてきとうにforkしてください、ダイアリーとかからも誘導しときます
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- module: Filter::EntryFullText::LDRFullFeed | |
config: | |
force_upgrade: 1 | |
alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json | |
force_alternative_siteinfo: 1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Plagger::Plugin::Filter::EntryFullText::LDRFullFeed; | |
use strict; | |
use base qw( Plagger::Plugin::Filter::EntryFullText ); | |
use JSON; | |
use Plagger::UserAgent; | |
use WebService::Wedata; | |
# Report the class id for this plugin.
#
# When the 'impersonate' config option is true the plugin identifies itself
# as "Filter-EntryFullText"; otherwise the parent class decides the id.
sub class_id {
    my ($plugin) = @_;

    if ($plugin->conf->{impersonate}) {
        return "Filter-EntryFullText";
    }

    return $plugin->SUPER::class_id;
}
# Load the plugins the parent class knows about, then append the rules
# obtained from the LDRFullFeed siteinfo.
sub load_plugins {
    my $plugin = shift;

    # Remaining arguments are handed to the parent implementation untouched.
    $plugin->SUPER::load_plugins(@_);

    return $plugin->load_plugin_siteinfo;
}
# Turn every siteinfo rule returned by _siteinfo into a SiteInfo plugin
# object and append it to $self->{plugins}.
#
# Logs a warning and returns early when no siteinfo is available.
sub load_plugin_siteinfo {
    my ($plugin) = @_;

    my $rules = $plugin->_siteinfo;
    unless ($rules) {
        Plagger->context->log(warn => "No siteinfo");
        return;
    }

    Plagger->context->log(debug => "Loaded siteinfo");

    # Build all handlers first so nothing is appended if a constructor dies.
    my @handlers
        = map { Plagger::Plugin::Filter::EntryFullText::SiteInfo->new($_) }
          @{$rules};
    push @{ $plugin->{plugins} }, @handlers;
}
# Fetch the LDRFullFeed siteinfo and convert it into a list of rules.
#
# The items are read from the Wedata 'LDRFullFeed' database. When Wedata
# cannot be reached, or when the 'force_alternative_siteinfo' config option
# is set, they are read from the JSON document at the 'alternative_siteinfo'
# URL instead.
#
# Returns an arrayref of hashes of the form
#   { handle => $url_pattern, extract_xpath => { body => $xpath } }
# or undef when no usable rule could be obtained.
sub _siteinfo {
    my $self = shift;

    my $ua     = Plagger::UserAgent->new;
    my $wedata = WebService::Wedata->new;
    $wedata->{ua} = $ua;    # make Wedata requests go through Plagger's user agent

    # Weight given to each siteinfo type when the item has no explicit priority.
    my %priority = qw/
        SBM        1000
        INDIVIDUAL 100
        IND        100
        SUBGENERAL 10
        SUB        10
        GENERAL    1
        GEN        1
    /;

    my $items_ref;
    if ($self->conf->{force_alternative_siteinfo}) {
        Plagger->context->log(
            debug => "Skipping Wedata: force_alternative_siteinfo is set"
        );
    }
    else {
        # Both the database lookup and the item download talk to Wedata, so
        # a failure in either one falls back to the alternative siteinfo.
        $items_ref = eval {
            my $db = $wedata->get_database('LDRFullFeed');
            $db ? $db->get_items : undef;
        };
        Plagger->context->log(debug => "Wedata seems down") unless $items_ref;
    }
    $items_ref ||= $self->_fetch_alternative_siteinfo($ua);

    my @items = $items_ref ? @{$items_ref} : ();
    for my $item (@items) {
        my $data = $item->{data} ||= {};

        # An unknown or missing type gets weight 0 so the sort below always
        # compares numbers.
        $data->{priority}
            ||= ($data->{type} && $priority{ $data->{type} }) || 0;
    }

    # NOTE(review): rules are emitted in ascending priority (GENERAL before
    # SBM). If the parent class applies the first matching plugin this may be
    # the reverse of what is intended -- confirm before changing the order.
    my @rules;
    for my $item (
        sort { $a->{data}->{priority} <=> $b->{data}->{priority} } @items
    ) {
        my $data = $item->{data};

        # A rule without a URL pattern or an XPath cannot extract anything.
        next unless defined $data->{url} && defined $data->{xpath};

        Plagger->context->log(
            debug => sprintf(
                'siteinfo: %s %s %s',
                $data->{url},
                $data->{xpath},
                defined $data->{type} ? $data->{type} : '',
            )
        );
        push @rules, {
            handle        => $data->{url},
            extract_xpath => {
                body => $data->{xpath},
            },
        };
    }

    return (@rules) ? \@rules : undef;
}

# Download and decode the alternative siteinfo JSON.
#
# The URL is taken from the 'alternative_siteinfo' config option; the older
# 'siteinfo' key is still honoured for existing configurations. Returns an
# arrayref of items, or nothing when the URL is missing, the request fails,
# or the payload is not a JSON array.
sub _fetch_alternative_siteinfo {
    my ($self, $ua) = @_;

    my $conf = $self->conf;
    my $url  = $conf->{alternative_siteinfo} || $conf->{siteinfo};
    unless ($url) {
        Plagger->context->log(warn => "alternative_siteinfo is not configured");
        return;
    }

    my $res = $ua->get($url);
    unless ($res->is_success) {
        Plagger->context->log(
            warn => sprintf(
                "Failed to fetch siteinfo from %s: %s",
                $url, $res->status_line
            )
        );
        return;
    }

    # The document may be wrapped in parentheses; strip them before decoding.
    my $json = $res->decoded_content;
    $json =~ s|^\(||;
    $json =~ s|\)$||;

    my $items = eval { from_json($json) };
    if (my $err = $@) {
        Plagger->context->log(
            warn => "Can't decode siteinfo from $url: $err"
        );
        return;
    }
    unless (ref $items eq 'ARRAY') {
        Plagger->context->log(warn => "Siteinfo from $url is not a JSON array");
        return;
    }

    return $items;
}
package Plagger::Plugin::Filter::EntryFullText::SiteInfo; | |
use base 'Plagger::Plugin::Filter::EntryFullText::YAML'; | |
# Extract entry fields from fetched HTML according to this siteinfo rule.
#
# Arguments:
#   $args->{content} - the HTML text to examine.
#
# Rule settings read from $self:
#   extract               - regex applied to the content; its captures are
#                           stored under the names in 'extract_capture'
#                           (whitespace separated, default 'body').
#   extract_xpath         - hashref mapping a field name to an XPath; all
#                           matching nodes are serialised and concatenated
#                           into that field.
#   extract_after_hook    - Perl code evaluated after extraction.
#   extract_date_format / extract_date_timezone
#                         - control how a captured 'date' field is parsed.
#
# Returns a hashref of extracted fields, or nothing when the rule is
# unusable or no field could be extracted.
sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "SiteInfo doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            # extract_capture is optional; treat a missing value as empty so
            # the default field name below applies.
            my $names = defined $self->{extract_capture}
                ? $self->{extract_capture}
                : '';
            my @capture = split /\s+/, $names;
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Serialise with Plagger's XML escaping for the duration of
                # this block only.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;

                my $body = '';
                for my $child (@children) {
                    $body .= $child->isElementNode
                        ? $child->as_XML
                        : $child->getValue;
                }

                # Store under the configured field name (e.g. 'body'), not
                # under a literal 'capture' key.
                $data->{$capture} = $body;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Everything needed has been copied into strings; release the tree.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE(review): this evaluates code taken from the rule itself.
            # The hook can see the lexicals $self, $args and $data, so those
            # names must not be renamed. Only load rules from trusted sources.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }

    # Nothing matched.
    return;
}
# XML-escape every argument in place.
#
# The elements of @_ alias the caller's variables, so assigning to them
# rewrites the caller's values directly.
sub xml_escape {
    for my $idx (0 .. $#_) {
        $_[$idx] = Plagger::Util::encode_xml($_[$idx]);
    }
}
1; | |
__END__ | |
=head1 NAME | |
Plagger::Plugin::Filter::EntryFullText::LDRFullFeed - Upgrade feeds to fulltext class by using LDRFullFeed siteinfo | |
=head1 SYNOPSIS | |
- module: Filter::EntryFullText::LDRFullFeed | |
config: | |
force_upgrade: 1 | |
alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json | |
=head1 DESCRIPTION | |
=head1 CONFIG | |
=over 4 | |
=item impersonate | |
=item store_html_on_failure | |
=item force_upgrade | |
=item alternative_siteinfo | |
This module uses the alternative siteinfo if Wedata is down.
=item force_alternative_siteinfo | |
=back | |
=head1 AUTHOR | |
fuba | |
=head1 SEE ALSO | |
L<Plagger>, L<Plagger::Plugin::Filter::EntryFullText> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment