fuba · December 25, 2008 03:44
diff --git a/config.yaml b/config.yaml
  - module: Filter::EntryFullText::LDRFullFeed
    config:
      force_upgrade: 1
      alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json
      force_alternative_siteinfo: 1
diff --git a/LDRFullFeed.pm b/LDRFullFeed.pm
 package Plagger::Plugin::Filter::EntryFullText::LDRFullFeed;
 use strict;
 use base qw( Plagger::Plugin::Filter::EntryFullText );

 use JSON;
 use Plagger::UserAgent;
 use WebService::Wedata;

 # Returns the class id for this plugin: the fixed string
 # "Filter-EntryFullText" when the 'impersonate' config option is true,
 # otherwise whatever the parent class reports.
 sub class_id {
    my ($plugin) = @_;

    if ($plugin->conf->{impersonate}) {
        return "Filter-EntryFullText";
    }
    return $plugin->SUPER::class_id;
 }

 # Runs the parent's plugin loading with the caller's arguments, then adds
 # the plugins built from the siteinfo rules.
 sub load_plugins {
    my $plugin = shift;

    $plugin->SUPER::load_plugins(@_);
    return $plugin->load_plugin_siteinfo;
 }

 # Fetches the siteinfo rules via _siteinfo and appends one SiteInfo plugin
 # per rule to $self->{plugins}.  Logs a warning and returns early when no
 # rules are available.
 sub load_plugin_siteinfo {
    my ($plugin) = @_;

    my $rules = $plugin->_siteinfo;
    unless ($rules) {
        Plagger->context->log(warn => "No siteinfo");
        return;
    }
    Plagger->context->log(debug => "Loaded siteinfo");

    # Build every wrapper first, then append them in one go.
    my @wrapped;
    for my $rule (@{$rules}) {
        push @wrapped,
            Plagger::Plugin::Filter::EntryFullText::SiteInfo->new($rule);
    }
    return push @{ $plugin->{plugins} }, @wrapped;
 }

 # Builds the list of extraction rules from the LDRFullFeed siteinfo.
 #
 # Items are read from the Wedata 'LDRFullFeed' database.  When that lookup
 # fails, or when the 'force_alternative_siteinfo' config option is set, the
 # items are downloaded as JSON from the URL in the 'alternative_siteinfo'
 # config option instead (the older, undocumented 'siteinfo' key is still
 # honoured as a fallback so existing configs keep working).
 #
 # Returns an array reference of
 #   { handle => <url>, extract_xpath => { body => <xpath> } }
 # hashes sorted by ascending priority, or undef/nothing when no rules could
 # be loaded.
 sub _siteinfo {
    my $self = shift;

    my $ua = Plagger::UserAgent->new;
    my $wedata = WebService::Wedata->new;
    $wedata->{ua} = $ua;

    # Default priority per siteinfo type, used for items that carry no
    # explicit priority of their own.
    my %priority = qw/
        SBM 1000
        INDIVIDUAL 100
        IND 100
        SUBGENERAL 10
        SUB 10
        GENERAL 1
        GEN 1
    /;

    my $items_ref;
    if ($self->conf->{force_alternative_siteinfo}) {
        Plagger->context->log(debug => "Alternative siteinfo forced by config");
    }
    else {
        # Both Wedata calls live inside the eval so that a failure at either
        # stage falls through to the alternative source instead of dying.
        $items_ref = eval {
            my $db = $wedata->get_database('LDRFullFeed');
            $db ? $db->get_items : undef;
        };
        Plagger->context->log(debug => "Wedata seems down") unless $items_ref;
    }

    unless ($items_ref) {
        my $url = $self->conf->{alternative_siteinfo}
               || $self->conf->{siteinfo};
        unless ($url) {
            Plagger->context->log(warn => "No alternative siteinfo URL configured");
            return;
        }

        my $res = $ua->get($url);
        unless ($res->is_success) {
            Plagger->context->log(
                warn => "Failed to fetch alternative siteinfo $url: " . $res->status_line
            );
            return;
        }

        my $json = $res->decoded_content;
        $json = '' unless defined $json;
        # The alternative source may wrap the JSON in parentheses; strip them.
        $json =~ s|^\(||;
        $json =~ s|\)$||;

        $items_ref = eval { from_json($json) };
        unless (ref $items_ref eq 'ARRAY') {
            Plagger->context->log(
                warn => "Alternative siteinfo at $url is not a JSON array"
                      . ($@ ? ": $@" : '')
            );
            return;
        }
    }

    # Fill in a default priority where the item has none (or a false one).
    for my $item (@{$items_ref}) {
        $item->{data}->{priority} ||= ($item->{data}->{type})
            ? ($priority{ $item->{data}->{type} } || 0)
            : 0;
    }

    # NOTE(review): the sort is ascending, so GENERAL rules end up ahead of
    # SBM rules in the returned list.  Confirm this matches the order in
    # which the parent class is expected to try plugins.
    my @rules;
    for my $item (
        sort { $a->{data}->{priority} <=> $b->{data}->{priority} }
        @{$items_ref}
    ) {
        Plagger->context->log(
            debug => sprintf(
                'siteinfo: %s %s %s',
                $item->{data}->{url},
                $item->{data}->{xpath},
                $item->{data}->{type}
            )
        );
        push @rules, {
            handle => $item->{data}->{url},
            extract_xpath => {
                body => $item->{data}->{xpath}
            },
        };
    }

    return (@rules) ? \@rules : undef;
 }

 package Plagger::Plugin::Filter::EntryFullText::SiteInfo;
 use base 'Plagger::Plugin::Filter::EntryFullText::YAML';

 # Extracts entry fields from page content according to this rule.
 #
 # $args->{content} is the content to examine.  Two mechanisms are supported
 # and may be combined:
 #   extract        a regex matched against the content; its captures are
 #                  stored under the whitespace-separated field names listed
 #                  in 'extract_capture' (default: 'body').
 #   extract_xpath  a hash of field name => XPath expression; all matching
 #                  nodes are serialised and concatenated into that field.
 #
 # If 'extract_after_hook' is set it is evaluated after extraction, and a
 # 'date' field, when present, is converted to a Plagger::Date.
 #
 # Returns a hash reference of the extracted fields, or nothing when the
 # rule has neither mechanism, a prerequisite is missing, or nothing matched.
 sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "SiteInfo doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = split /\s+/, $self->{extract_capture};
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Temporarily swap HTML::Element's escaper for ours while the
                # matched nodes are serialised.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                my $body = '';
                for my $child (@children) {
                    $body .= $child->isElementNode
                        ? $child->as_XML
                        : $child->getValue;
                }
                # Store under the field name being captured (e.g. 'body'),
                # not under the literal key 'capture'.
                $data->{$capture} = $body;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Everything needed has been serialised to strings; release the tree
        # explicitly so its parent/child cycles do not keep it alive.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            # NOTE(review): this is a string eval of rule-supplied code, with
            # $self, $args and $data in scope.  Only safe when rules come from
            # a trusted source.
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                # Accept a single format or a list; the first list element's
                # parse result is used.
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }

    return;
 }

 # XML-encodes every argument in place.  @_ aliases the caller's variables,
 # so assigning through it rewrites them directly.
 sub xml_escape {
    foreach my $idx (0 .. $#_) {
        $_[$idx] = Plagger::Util::encode_xml($_[$idx]);
    }
 }


 1;

 __END__

 =head1 NAME

 Plagger::Plugin::Filter::EntryFullText::LDRFullFeed - Upgrade feeds to fulltext class by using LDRFullFeed siteinfo

 =head1 SYNOPSIS

  - module: Filter::EntryFullText::LDRFullFeed
    config:
      force_upgrade: 1
      alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json

 =head1 DESCRIPTION

 =head1 CONFIG

 =over 4

 =item impersonate

 =item store_html_on_failure

 =item force_upgrade

 =item alternative_siteinfo

 This module uses the alternative siteinfo when Wedata is down.

 =item force_alternative_siteinfo

 =back

 =head1 AUTHOR

 fuba

 =head1 SEE ALSO

 L<Plagger>, L<Plagger::Plugin::Filter::EntryFullText>
	- module: Filter::EntryFullText::LDRFullFeed
	config:
	force_upgrade: 1
	alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json
	force_alternative_siteinfo: 1
	package Plagger::Plugin::Filter::EntryFullText::LDRFullFeed;
	use strict;
	use base qw( Plagger::Plugin::Filter::EntryFullText );

	use JSON;
	use Plagger::UserAgent;
	use WebService::Wedata;

	# Returns the class id for this plugin: the fixed string
	# "Filter-EntryFullText" when the 'impersonate' config option is true,
	# otherwise whatever the parent class reports.
	sub class_id {
	    my ($plugin) = @_;

	    if ($plugin->conf->{impersonate}) {
	        return "Filter-EntryFullText";
	    }
	    return $plugin->SUPER::class_id;
	}

	# Runs the parent's plugin loading with the caller's arguments, then adds
	# the plugins built from the siteinfo rules.
	sub load_plugins {
	    my $plugin = shift;

	    $plugin->SUPER::load_plugins(@_);
	    return $plugin->load_plugin_siteinfo;
	}

	# Fetches the siteinfo rules via _siteinfo and appends one SiteInfo plugin
	# per rule to $self->{plugins}.  Logs a warning and returns early when no
	# rules are available.
	sub load_plugin_siteinfo {
	    my ($plugin) = @_;

	    my $rules = $plugin->_siteinfo;
	    unless ($rules) {
	        Plagger->context->log(warn => "No siteinfo");
	        return;
	    }
	    Plagger->context->log(debug => "Loaded siteinfo");

	    # Build every wrapper first, then append them in one go.
	    my @wrapped;
	    for my $rule (@{$rules}) {
	        push @wrapped,
	            Plagger::Plugin::Filter::EntryFullText::SiteInfo->new($rule);
	    }
	    return push @{ $plugin->{plugins} }, @wrapped;
	}

	# Builds the list of extraction rules from the LDRFullFeed siteinfo.
	#
	# Items are read from the Wedata 'LDRFullFeed' database.  When that lookup
	# fails, or when the 'force_alternative_siteinfo' config option is set, the
	# items are downloaded as JSON from the URL in the 'alternative_siteinfo'
	# config option instead (the older, undocumented 'siteinfo' key is still
	# honoured as a fallback so existing configs keep working).
	#
	# Returns an array reference of
	#   { handle => <url>, extract_xpath => { body => <xpath> } }
	# hashes sorted by ascending priority, or undef/nothing when no rules could
	# be loaded.
	sub _siteinfo {
	    my $self = shift;

	    my $ua = Plagger::UserAgent->new;
	    my $wedata = WebService::Wedata->new;
	    $wedata->{ua} = $ua;

	    # Default priority per siteinfo type, used for items that carry no
	    # explicit priority of their own.
	    my %priority = qw/
	        SBM 1000
	        INDIVIDUAL 100
	        IND 100
	        SUBGENERAL 10
	        SUB 10
	        GENERAL 1
	        GEN 1
	    /;

	    my $items_ref;
	    if ($self->conf->{force_alternative_siteinfo}) {
	        Plagger->context->log(debug => "Alternative siteinfo forced by config");
	    }
	    else {
	        # Both Wedata calls live inside the eval so that a failure at either
	        # stage falls through to the alternative source instead of dying.
	        $items_ref = eval {
	            my $db = $wedata->get_database('LDRFullFeed');
	            $db ? $db->get_items : undef;
	        };
	        Plagger->context->log(debug => "Wedata seems down") unless $items_ref;
	    }

	    unless ($items_ref) {
	        my $url = $self->conf->{alternative_siteinfo}
	               || $self->conf->{siteinfo};
	        unless ($url) {
	            Plagger->context->log(warn => "No alternative siteinfo URL configured");
	            return;
	        }

	        my $res = $ua->get($url);
	        unless ($res->is_success) {
	            Plagger->context->log(
	                warn => "Failed to fetch alternative siteinfo $url: " . $res->status_line
	            );
	            return;
	        }

	        my $json = $res->decoded_content;
	        $json = '' unless defined $json;
	        # The alternative source may wrap the JSON in parentheses; strip them.
	        $json =~ s|^\(||;
	        $json =~ s|\)$||;

	        $items_ref = eval { from_json($json) };
	        unless (ref $items_ref eq 'ARRAY') {
	            Plagger->context->log(
	                warn => "Alternative siteinfo at $url is not a JSON array"
	                      . ($@ ? ": $@" : '')
	            );
	            return;
	        }
	    }

	    # Fill in a default priority where the item has none (or a false one).
	    for my $item (@{$items_ref}) {
	        $item->{data}->{priority} ||= ($item->{data}->{type})
	            ? ($priority{ $item->{data}->{type} } || 0)
	            : 0;
	    }

	    # NOTE(review): the sort is ascending, so GENERAL rules end up ahead of
	    # SBM rules in the returned list.  Confirm this matches the order in
	    # which the parent class is expected to try plugins.
	    my @rules;
	    for my $item (
	        sort { $a->{data}->{priority} <=> $b->{data}->{priority} }
	        @{$items_ref}
	    ) {
	        Plagger->context->log(
	            debug => sprintf(
	                'siteinfo: %s %s %s',
	                $item->{data}->{url},
	                $item->{data}->{xpath},
	                $item->{data}->{type}
	            )
	        );
	        push @rules, {
	            handle => $item->{data}->{url},
	            extract_xpath => {
	                body => $item->{data}->{xpath}
	            },
	        };
	    }

	    return (@rules) ? \@rules : undef;
	}

	package Plagger::Plugin::Filter::EntryFullText::SiteInfo;
	use base 'Plagger::Plugin::Filter::EntryFullText::YAML';

	# Extracts entry fields from page content according to this rule.
	#
	# $args->{content} is the content to examine.  Two mechanisms are supported
	# and may be combined:
	#   extract        a regex matched against the content; its captures are
	#                  stored under the whitespace-separated field names listed
	#                  in 'extract_capture' (default: 'body').
	#   extract_xpath  a hash of field name => XPath expression; all matching
	#                  nodes are serialised and concatenated into that field.
	#
	# If 'extract_after_hook' is set it is evaluated after extraction, and a
	# 'date' field, when present, is converted to a Plagger::Date.
	#
	# Returns a hash reference of the extracted fields, or nothing when the
	# rule has neither mechanism, a prerequisite is missing, or nothing matched.
	sub extract {
	    my($self, $args) = @_;
	    my $data;

	    unless ($self->{extract} || $self->{extract_xpath}) {
	        Plagger->context->log(error => "SiteInfo doesn't have either 'extract' nor 'extract_xpath'");
	        return;
	    }

	    if ($self->{extract}) {
	        if (my @match = $args->{content} =~ /$self->{extract}/s) {
	            my @capture = split /\s+/, $self->{extract_capture};
	            @capture = ('body') unless @capture;
	            @{$data}{@capture} = @match;
	        }
	    }

	    if ($self->{extract_xpath}) {
	        eval { require HTML::TreeBuilder::XPath };
	        if ($@) {
	            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
	            return;
	        }

	        my $tree = HTML::TreeBuilder::XPath->new;
	        $tree->parse($args->{content});
	        $tree->eof;

	        for my $capture (keys %{$self->{extract_xpath}}) {
	            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
	            if (@children) {
	                # Temporarily swap HTML::Element's escaper for ours while the
	                # matched nodes are serialised.
	                no warnings 'redefine';
	                local *HTML::Element::_xml_escape = \&xml_escape;
	                my $body = '';
	                for my $child (@children) {
	                    $body .= $child->isElementNode
	                        ? $child->as_XML
	                        : $child->getValue;
	                }
	                # Store under the field name being captured (e.g. 'body'),
	                # not under the literal key 'capture'.
	                $data->{$capture} = $body;
	            } else {
	                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
	            }
	        }

	        # Everything needed has been serialised to strings; release the tree
	        # explicitly so its parent/child cycles do not keep it alive.
	        $tree->delete;
	    }

	    if ($data) {
	        if ($self->{extract_after_hook}) {
	            # NOTE(review): this is a string eval of rule-supplied code, with
	            # $self, $args and $data in scope.  Only safe when rules come from
	            # a trusted source.
	            eval $self->{extract_after_hook};
	            Plagger->context->error($@) if $@;
	        }

	        if ($data->{date}) {
	            if (my $format = $self->{extract_date_format}) {
	                # Accept a single format or a list; the first list element's
	                # parse result is used.
	                $format = [ $format ] unless ref $format;
	                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
	                if ($data->{date} && $self->{extract_date_timezone}) {
	                    $data->{date}->set_time_zone($self->{extract_date_timezone});
	                }
	            } else {
	                $data->{date} = Plagger::Date->parse_dwim($data->{date});
	            }
	        }

	        return $data;
	    }

	    return;
	}

	# XML-encodes every argument in place.  @_ aliases the caller's variables,
	# so assigning through it rewrites them directly.
	sub xml_escape {
	    foreach my $idx (0 .. $#_) {
	        $_[$idx] = Plagger::Util::encode_xml($_[$idx]);
	    }
	}


	1;

	__END__

	=head1 NAME

	Plagger::Plugin::Filter::EntryFullText::LDRFullFeed - Upgrade feeds to fulltext class by using LDRFullFeed siteinfo

	=head1 SYNOPSIS

	- module: Filter::EntryFullText::LDRFullFeed
	config:
	force_upgrade: 1
	alternative_siteinfo: http://utatane.appjet.net/databases/LDRFullFeed/items.json

	=head1 DESCRIPTION

	=head1 CONFIG

	=over 4

	=item impersonate

	=item store_html_on_failure

	=item force_upgrade

	=item alternative_siteinfo

	This module uses the alternative siteinfo when Wedata is down.

	=item force_alternative_siteinfo

	=back

	=head1 AUTHOR

	fuba

	=head1 SEE ALSO

	L<Plagger>, L<Plagger::Plugin::Filter::EntryFullText>