Skip to content

Instantly share code, notes, and snippets.

@emasaka
Created September 26, 2011 12:12
Show Gist options
  • Select an option

  • Save emasaka/1242098 to your computer and use it in GitHub Desktop.

Select an option

Save emasaka/1242098 to your computer and use it in GitHub Desktop.
package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );
use DirHandle;
use Encode;
use File::Spec;
use List::Util qw(first);
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content );
use Plagger::UserAgent;
our $VERSION = 0.02;
# Hook this plugin into the given context: 'customfeed.handle' events are
# routed to handle().
#
#   $self    - the plugin instance being registered
#   $context - object providing register_hook()
#
# Evaluates to whatever register_hook() returns.
sub register {
    my $self    = shift;
    my $context = shift;

    my %hook_for = ( 'customfeed.handle' => \&handle );

    $context->register_hook( $self, %hook_for );
}
# Initialise the plugin: run the parent class initialisation with the
# arguments received, load the site plugins, then create the user agent
# kept in $self->{ua}.
sub init {
    my ( $self, @args ) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins;

    # Kept as the last statement so the sub still evaluates to the agent.
    $self->{ua} = Plagger::UserAgent->new;
}
# Load every site definition shipped as a plugin asset.
#
# load_assets() is called once per file pattern with a callback that
# forwards the callback's arguments to the matching loader. YAML
# definitions are requested before Perl ones, so that is also the order in
# which the loaders are asked to register plugins.
sub load_plugins {
    my $self = shift;

    $self->load_assets( '*.yaml', sub { $self->load_plugin_yaml(@_) } );
    $self->load_assets( '*.pl',   sub { $self->load_plugin_perl(@_) } );
}
# Load one Perl site plugin from an asset file and register an instance of
# it in $self->{plugins}.
#
#   $file - path of the .pl asset to read
#   $base - file name of the asset; with a trailing ".pl" removed it becomes
#           the last component of the plugin package name
#
# Returns the plugin instance that was registered.
#
# If the plugin class already responds to new() the file is neither opened
# nor compiled again, but an instance is still registered so that this
# plugin object gets its own handler.
#
# A file whose code does not begin with a "package" statement is wrapped in
# a generated package that inherits from
# Plagger::Plugin::CustomFeed::Config::Site and whose site_name() returns
# the derived name.
sub load_plugin_perl {
    my ( $self, $file, $base ) = @_;

    Plagger->context->log( debug => "Load plugin $file" );

    ( my $pkg = $base ) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::CustomFeed::Config::Site::$pkg";

    if ( $plugin_class->can('new') ) {
        Plagger->context->log( warn => "$plugin_class is already defined. skip compiling code" );
    }
    else {
        open my $fh, '<', $file or Plagger->context->error("$file: $!");
        my $code = join '', <$fh>;
        close $fh;

        unless ( $code =~ /^\s*package/s ) {
            $code = join "\n",
                ( "package $plugin_class;",
                  "use strict;",
                  "use base qw( Plagger::Plugin::CustomFeed::Config::Site );",
                  "sub site_name { '$pkg' }",
                  $code,
                  "1;" );
        }

        # NOTE(review): this executes the asset file's content as Perl code;
        # the assets directory must only contain trusted files.
        eval $code;
        Plagger->context->error($@) if $@;
    }

    my $plugin = $plugin_class->new;
    push @{ $self->{plugins} }, $plugin;
    return $plugin;
}
# Load a YAML asset and register one YAML-driven site plugin for each value
# returned by YAML::LoadFile.
#
#   $file - path of the YAML asset
#   $base - file name of the asset, handed to each plugin's constructor
sub load_plugin_yaml {
    my $self = shift;
    my ( $file, $base ) = @_;

    Plagger->context->log( debug => "Load YAML $file" );

    my @sites =
        map { Plagger::Plugin::CustomFeed::Config::YAML->new( $_, $base ) }
        YAML::LoadFile($file);

    push @{ $self->{plugins} }, @sites;
}
# 'customfeed.handle' hook: fetch the feed's page and, when one of the
# loaded site plugins claims the URL, let that plugin build the feed.
#
#   $context - object used here for log()
#   $args    - hash ref; $args->{feed} is read (and its url may be
#              rewritten), $args->{content} is set to the decoded page
#
# Returns the claiming plugin's aggregate() result, or nothing when the
# fetch failed, no plugin claims the URL, or the URL contains
# "output=rss" / "output=atom".
sub handle {
    my ( $self, $context, $args ) = @_;

    # NoNetwork: don't connect for 3 hours
    my %fetch_option = ( NoNetwork => 60 * 60 * 3 );
    my $res = $self->{ua}->fetch( $args->{feed}->url, $self, \%fetch_option );

    if ( !$res->status && $res->is_error ) {
        $context->log( debug => "Fetch " . $args->{feed}->url . " failed" );
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as feed url
    if ( my $http_response = $res->http_response ) {
        my $base = $http_response->request->uri;
        unless ( $base eq $args->{feed}->url ) {
            $context->log( info => "rewrite url to $base" );
            $args->{feed}->url($base);
        }
    }

    # The plugins are consulted before the URL filter below is applied.
    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    return unless $handler;
    return if $args->{feed}->url =~ /output=(?:rss|atom)/;

    $context->log( debug => $args->{feed}->url . " custom_feed_handle by " . $handler->site_name );
    return $handler->aggregate( $context, $args );
}
package Plagger::Plugin::CustomFeed::Config::Site;

# Base class for site plugins. The defaults below claim no feed and return
# nothing from the follow_* methods.

# Construct an empty plugin object of the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Default: this plugin does not handle any feed.
sub custom_feed_handle { return 0 }

# Defaults: nothing to return.
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }
package Plagger::Plugin::CustomFeed::Config::YAML;
use Encode;
use List::Util qw(first);
use Plagger::Util qw( decode_content extract_title );
# Build a YAML-driven site plugin from one loaded YAML value.
#
#   $class - class to bless into
#   $data  - hash ref of settings; it is updated in place with the
#            normalised "custom_feed_*" keys before being copied
#   $base  - stored under the "base" key of the new object
#
# Returns the new object: a shallow copy of $data plus "base".
sub new {
    my ( $class, $data, $base ) = @_;

    # old version compatible: "match" is the former name of the handle rule
    if ( defined $data->{match} ) {
        $data->{custom_feed_handle} = $data->{match};
    }

    # old version compatible: copy each short key to its custom_feed_ name
    my @short_keys = qw(extract extract_date_format extract_date_timezone extract_capture extract_xpath extract_after_hook);
    for my $short (@short_keys) {
        my $value = $data->{$short};
        next unless defined $value;
        $data->{ 'custom_feed_' . $short } =
            ref($value) eq 'ARRAY' ? [ @{$value} ] : $value;
    }

    # add ^ if handle method starts with http://
    my $handle = $data->{custom_feed_handle};
    if ( defined $handle && $handle =~ m!^https?://! ) {
        $data->{custom_feed_handle} = "^$handle";
    }

    if ($YAML::VERSION < '0.71') {
        # decode as UTF-8
        my @text_keys = qw(custom_feed_extract custom_feed_extract_date_format custom_feed_extract_after_hook);
        for my $name (@text_keys) {
            my $value = $data->{$name};
            next unless defined $value;
            $data->{$name} =
                ref($value) eq 'ARRAY'
                ? [ map { decode( "UTF-8", $_ ) } @{$value} ]
                : decode( "UTF-8", $value );
        }
    }

    return bless { %{$data}, base => $base }, $class;
}
# Name of this site plugin: the "base" value stored by the constructor.
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
# Decide whether this plugin handles the feed in $args->{feed}.
#
# Returns 0 when no (true) handle pattern is configured; otherwise the
# result of matching the feed URL against that pattern.
sub custom_feed_handle {
    my ( $self, $args ) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
# Replace each argument, in place, with the result of
# Plagger::Util::encode_xml. @_ aliases the caller's variables, so the
# caller sees the converted values.
sub xml_escape {
    for my $index ( 0 .. $#_ ) {
        $_[$index] = Plagger::Util::encode_xml( $_[$index] );
    }
}
# Build a Plagger::Feed from the fetched page in $args->{content}, using the
# extraction rules stored on this object, and add it to $context->update.
#
#   $context - object used here for log(), error() and update->add()
#   $args    - hash ref; this sub reads $args->{feed} (title, url) and
#              $args->{content}
#
# Returns 1 after the feed has been added. Returns nothing (bare return)
# when neither custom_feed_extract nor custom_feed_extract_xpath is set,
# when HTML::TreeBuilder::XPath cannot be loaded, or when an extracted item
# has neither a title nor a link.
#
# NOTE(review): the after-hook below is string-eval'd inside this sub, so
# hook code can refer to the lexical variables declared here (for example
# $data). Renaming them may break existing hooks - confirm before changing.
sub aggregate {
my($self, $context, $args) = @_;
# At least one of the two extraction rules must be configured.
unless ($self->{custom_feed_extract} || $self->{custom_feed_extract_xpath}) {
$context->log(error => "YAML doesn't have either 'custom_feed_extract' nor 'custom_feed_extract_xpath'");
return;
}
my $feed = Plagger::Feed->new;
# Prefer the title already on the subscribed feed; otherwise take it from the page.
$feed->title($args->{feed}->title || extract_title($args->{content}));
$feed->link($args->{feed}->url);
# Offsets into $args->{content}: where the previous regex match ended and
# where the current one ended.
my $prev_pos = 0;
my $cur_pos = 0;
# XPath mode: capture name => list of nodes found for that capture.
my %nodes = ();
if ($self->{custom_feed_extract_xpath}) {
# The XPath module is loaded only when an XPath rule is configured.
eval { require HTML::TreeBuilder::XPath };
if ($@) {
$context->log(error => "HTML::TreeBuilder::XPath is required. $@");
return;
}
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse($args->{content});
$tree->eof;
# Each key of custom_feed_extract_xpath names an entry field; its value is
# the XPath expression evaluated against the parsed page.
for my $capture (keys %{$self->{custom_feed_extract_xpath}}) {
@{$nodes{$capture}} = $tree->findnodes($self->{custom_feed_extract_xpath}->{$capture});
# A rule that finds nothing is logged but does not stop processing.
unless (@{$nodes{$capture}}) {
$context->log(error => "Can't find node matching $self->{custom_feed_extract_xpath}->{$capture}");
}
}
}
# Each pass builds at most one entry; the loop ends when neither the regex
# nor the XPath extraction produced any data.
while (1) {
my $data;
if ($self->{custom_feed_extract}) {
my $extract = decode_content($self->{custom_feed_extract});
# Scalar-context //g match: continues from the content's current pos(),
# so successive passes find successive matches.
if ($args->{content} =~ /$extract/sg) {
$cur_pos = pos $args->{content};
# Match again, without //g, on the content starting where the previous
# match ended, this time in list context to obtain the captured groups.
my $str = substr($args->{content}, $prev_pos, length($args->{content}));
if (my @match = $str =~ /$extract/s) {
# custom_feed_extract_capture is a whitespace-separated list of field
# names, assigned to the captured groups by position. A name listed more
# than once gets its captures concatenated.
my @capture = split /\s+/, $self->{custom_feed_extract_capture};
for my $m (@match) {
my $val = shift @capture;
$data->{$val} = $data->{$val} . $m;
}
}
$prev_pos = $cur_pos;
}
}
if (%nodes) {
for my $capture (keys %{$self->{custom_feed_extract_xpath}}) {
# For the rest of this loop body, HTML::Element::_xml_escape is replaced
# by this package's xml_escape; "local" restores the original afterwards.
no warnings 'redefine';
local *HTML::Element::_xml_escape = \&xml_escape;
# Take the next unused node for this capture, so the n-th entry receives
# the n-th node of every capture's list.
my $children = shift @{$nodes{$capture}};
if ($children) {
# Element nodes are serialised as XML; other nodes contribute their value.
$data->{$capture} = $children->isElementNode
? $children->as_XML
: $children->getValue;;
}
}
}
# Nothing was extracted on this pass: all items have been consumed.
unless ($data) {
last;
}
if ($self->{custom_feed_extract_after_hook}) {
# The hook is a string of Perl code compiled and run in this scope.
# NOTE(review): this executes code taken from the site definition -
# confirm that definitions only come from trusted files.
eval $self->{custom_feed_extract_after_hook};
$context->error($@) if $@;
}
unless ($data->{title} || $data->{link}) {
$context->log(error => "doesn't have either 'title' nor 'link'");
# NOTE(review): this returns before $context->update->add($feed), so
# entries already added to $feed on earlier passes are not published.
return;
}
if ($data->{date}) {
if (my $format = $self->{custom_feed_extract_date_format}) {
# Accept a single format or a list of formats; the first element of the
# list produced by trying the formats in order is kept.
$format = [ $format ] unless ref $format;
$data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
if ($data->{date} && $self->{custom_feed_extract_date_timezone}) {
$data->{date}->set_time_zone($self->{custom_feed_extract_date_timezone});
}
} else {
# No explicit format configured: let Plagger::Date parse the string.
$data->{date} = Plagger::Date->parse_dwim($data->{date});
}
}
# Turn the extracted fields into a feed entry; the link doubles as the id.
my $entry = Plagger::Entry->new;
$entry->id($data->{link});
$entry->link($data->{link});
$entry->title($data->{title});
$entry->body($data->{body}) if $data->{body};
$entry->author($data->{author}) if $data->{author};
$entry->icon({ url => $data->{icon} }) if $data->{icon};
$entry->summary($data->{summary}) if $data->{summary};
# extract date using found one
if ($data->{date}) {
$entry->date($data->{date});
}
$feed->add_entry($entry);
$context->log(info => "Add $data->{link} ($data->{title})");
}
$context->update->add($feed);
return 1;
}
1;
__END__
=head1 NAME

Plagger::Plugin::CustomFeed::Config - Configurable way to create title and link only custom feeds

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html
  - module: CustomFeed::Config

=head1 DESCRIPTION

This plugin creates a custom feed from HTML pages.
Use it with the EntryFullText plugin to get the full content and an
accurate datetime for each article.

You can write a custom feed handler by putting C<.pl> or C<.yaml>
files under the plugin's assets directory.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment