Skip to content

Instantly share code, notes, and snippets.

@SebDeclercq
Last active August 29, 2015 14:06
Show Gist options
  • Save SebDeclercq/5ce8954debf9e627e57f to your computer and use it in GitHub Desktop.
Save SebDeclercq/5ce8954debf9e627e57f to your computer and use it in GitHub Desktop.
#! /usr/bin/perl -w
use strict;
use warnings;
use feature 'say';
use LWP::UserAgent;
use XML::RSS;
use HTML::ResolveLink qw/extract_main_html/;
use experimental 'signatures';
use RSS::DB;
# Feeds to harvest: display name (stored in the "source" column) => RSS URL.
my %sources = (
    "Demain la Terre" => 'http://www.lesoir.be/feed/Univers/Demain%20la%20Terre/default_univers_block',
    "Actu"            => 'http://www.lesoir.be/feed/La%20Une/destination_une_block',
    "Fil Info"        => 'http://rss.feedsportal.com/c/864/f/11087/index.rss',
);

# Shared HTTP client; scrapAndInsert() also uses it to fetch article pages.
my $ua = LWP::UserAgent->new();

# Open the database handle once instead of once per feed.
my $schema = RSS::DB->connect("dbi:SQLite:rss.db");

# Columns written for every feed item.
my @feed_columns = qw/title link content source/;

for my $source (keys %sources) {
    my $url = $sources{$source};

    # Plain GET. The previous `content_file => "feed.xml"` argument lacked the
    # leading colon LWP requires for its special options, so it was sent as a
    # request header instead. The real `:content_file` option is not wanted
    # here either: it would write the body to disk and leave nothing for
    # decoded_content() to return.
    my $resp = $ua->get($url);
    die "Cannot fetch feed '$source' ($url): " . $resp->status_line
        unless $resp->is_success;

    # charset => 'none' skips character-set decoding, so the XML parser
    # receives the bytes as served (presumably so it can honour the
    # document's own encoding declaration).
    my $feed = $resp->decoded_content(charset => 'none');

    my $rss = XML::RSS->new;
    $rss->parse($feed);

    for my $item (@{ $rss->{items} }) {
        my %rec = scrapAndInsert($item);
        $rec{source} = $source;

        # Keep only the columns of the feed table.
        my %values;
        @values{@feed_columns} = @rec{@feed_columns};

        $schema->resultset('Feed')->update_or_create(\%values);
    }
}
# Fetch the article page behind one RSS item and reduce it to plain text.
#
# Despite its name, this sub does not write to the database; the caller
# stores the returned record.
#
# Parameters:
#   $item - hash ref for one feed item; only {title} and {link} are read.
# Returns:
#   A key/value list with the keys title, link and content. content has its
#   markup stripped and whitespace collapsed, and is the empty string when
#   none of the known page sections is found.
# Dies:
#   When the article page cannot be fetched.
sub scrapAndInsert ($item) {
    my %rec = (
        title   => $item->{title},
        link    => $item->{link},
        content => '',    # never undef: it is edited in place and stored in a NOT NULL column
    );

    my $resp = $ua->get($rec{link});
    die "Cannot fetch article '" . ($rec{link} // '') . "': " . $resp->status_line
        unless $resp->is_success;

    # Resolve relative links against the base URL of the response.
    my $linker = HTML::ResolveLink->new(base => $resp->base());

    # Try to isolate the main part of the page first; on any failure fall
    # back to the complete page.
    # NOTE(review): extract_main_html is normally exported by
    # HTML::ExtractMain - confirm the import at the top of the file really
    # provides it, otherwise this eval always fails and only the fallback runs.
    my $content;
    eval {
        $content = extract_main_html($linker->resolve($resp->decoded_content));
    };
    $content ||= $linker->resolve($resp->decoded_content(charset => 'none') // '');
    $content //= '';

    # Introductory section, marked up in one of two ways (presumably the
    # article's lead paragraph - "chapeau").
    if ($content =~ m{<div class="ours">(.*?)</div>}s) {
        $rec{content} = $1 . "\n";
    } elsif ($content =~ m{<!-- chapeau -->(.*?)<!-- /chapeau -->}s) {
        $rec{content} = $1 . "\n";
    }

    # Article body. The capture is used instead of $&; the surrounding <div>
    # tags would be stripped below anyway. The non-greedy match stops at the
    # first </div>, so a body containing nested <div>s is truncated there.
    if ($content =~ m{<div class="article-body">(.*?)</div>}s) {
        $rec{content} .= $1;
    }

    # Collapse whitespace and non-breaking-space entities, then drop all tags.
    $rec{content} =~ s/(?:\s+|&nbsp;)/ /g;
    $rec{content} =~ s/<[^>]+>//g;

    return %rec;
}
-- Storage for scraped feed items. Titles are unique, so there is at most
-- one row per distinct article title.
CREATE TABLE feed (
    id      INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,  -- surrogate row id
    source  TEXT NOT NULL,                               -- name of the feed the item came from
    title   TEXT UNIQUE NOT NULL,                        -- item title
    link    TEXT NOT NULL,                               -- URL of the article page
    content TEXT NOT NULL                                -- article text with markup removed
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment