#!/usr/bin/perl
use strict;
use warnings;

use Config::Pit;
use WWW::Mechanize;
use Web::Scraper;
use XML::Feed;
use DateTime;
use DateTime::Format::MySQL;
use DateTime::Format::W3CDTF;
use Cache::File;
|
|
|
# Naver credentials are looked up through Config::Pit under the
# "www.naver.com" profile. The hash below names the two keys this script
# reads later (id, pw); its values are only placeholder text passed to
# pit_get through the `require` option.
my %required_credentials = (
    id => 'your id on example',
    pw => 'your pw on example',
);
my $config = pit_get('www.naver.com', require => \%required_credentials);
|
|
|
# Log in to Naver: fetch the login page, then fill in and submit its
# "frmNIDLogin" form with the stored id/pw. The same $mech object is reused
# for the article requests further down.
# NOTE(review): the login page is requested over plain http -- consider
# switching to https after confirming the endpoint supports it.
my $url  = 'http://nid.naver.com/nidlogin.login';
my $mech = WWW::Mechanize->new;
$mech->get($url);

my %login_fields = (
    id => $config->{id},
    pw => $config->{pw},
);
my $res = $mech->submit_form(
    form_name => 'frmNIDLogin',
    fields    => \%login_fields,
);
|
|
|
# Cafe settings and the Atom feed that the article entries are added to.
my $cafeName = 'your cafe name';
my $cafeLink = 'http://cafe.naver.com/[cafe]';
my $cafeId   = '11192577';

my $feed = XML::Feed->new('Atom');

# Feed-level metadata, applied in this exact order via the setter of the
# same name.
my @feed_metadata = (
    [ title       => $cafeName ],
    [ link        => $cafeLink ],
    [ description => 'Generated by AtomGen / JEEN' ],
    [ author      => $cafeName ],
    [ id          => "tag:cafe.naver.com:$cafeId" ],
);
for my $pair (@feed_metadata) {
    my ($setter, $value) = @{$pair};
    $feed->$setter($value);
}

# Feed timestamp: the current Asia/Tokyo wall-clock time, stringified, tagged
# with an explicit +09:00 offset and re-parsed as a W3CDTF date.
my $generated_at = DateTime->now(time_zone => 'Asia/Tokyo');
my $modified     = DateTime::Format::W3CDTF->parse_datetime("${generated_at}+09:00");
$feed->modified($modified);
|
|
|
# Read the article id stored in the state file (the same file is rewritten at
# the end of the script). All CR/LF characters are removed and surrounding
# whitespace is trimmed. The script dies with the offending path/value when
# the file cannot be read or does not hold a plain non-negative integer,
# because a bogus value would otherwise be turned into a number silently and
# written back to the state file.
my $ini = do {
    my $last_id_path = '/home/[your home]/tools/last_insert_id.ini';

    open my $fw, '<', $last_id_path
        or die "Cannot open $last_id_path for reading: $!";
    my $raw = join '', <$fw>;
    close $fw
        or die "Cannot close $last_id_path: $!";

    $raw =~ s/[\r\n]//g;
    $raw =~ s/\A\s+//;
    $raw =~ s/\s+\z//;

    die "$last_id_path does not contain a numeric article id (got '$raw')\n"
        unless $raw =~ /\A[0-9]+\z/;

    $raw;
};
|
|
|
# File-backed cache for scraped articles, rooted at /tmp/atomgen, with a
# default expiry of '1 hour'.
my @cache_options = (
    cache_root      => '/tmp/atomgen',
    default_expires => '1 hour',
);
my $cache = Cache::File->new(@cache_options);
die $! unless $cache;
|
|
|
# Convert the date text scraped from an article page into a W3CDTF timestamp
# string with a fixed +09:00 offset.
#
# The accepted shape is "YYYY.M.D HH:MM", optionally preceded by a single
# character plus "/" and optionally followed by whitespace.
#
# Returns the timestamp string, or undef when the text is missing or does not
# have that shape.
sub _registered_at_to_w3cdtf {
    my ($text) = @_;
    return unless defined $text;

    $text =~ s{\A./}{};      # drop a leading "<char>/" prefix
    $text =~ s/\s+\z//;      # drop trailing whitespace

    my ($year, $month, $day, $hour, $minute)
        = $text =~ /\A([0-9]{4})\.([0-9]{1,2})\.([0-9]{1,2}) ([0-9]{2}):([0-9]{2})\z/
        or return;

    return sprintf '%04d-%02d-%02dT%02d:%02d:00+09:00',
        $year, $month, $day, $hour, $minute;
}

# Walk the article ids: start 18 ids before the stored id ($ini), add one feed
# entry per article found, and stop at the first id >= $ini that has no
# article. Afterwards $article_id holds the id just before that missing one;
# the end of the script writes it to the state file.
#
# NOTE(review): because the walk stops at the first missing id >= $ini, a
# single deleted article right after the stored id would keep the scan from
# ever advancing -- confirm whether a look-ahead is wanted.
my $article_id = $ini - 19;

# Never start below zero: otherwise the loop body would be skipped entirely
# and a negative id would end up in the state file.
$article_id = 0 if $article_id < 0;

ARTICLE:
while (1) {
    ++$article_id;

    # The URI doubles as the cache key.
    my $uri = sprintf(
        'http://cafe.naver.com/ArticleRead.nhn?articleid=%s&clubid=%s',
        $article_id, $cafeId,
    );

    # thaw() returns undef when the entry is absent (or has expired), so a
    # single call replaces the exists()/thaw() pair and cannot race with
    # expiry between the two calls.
    my $c = $cache->thaw($uri);
    unless ($c) {
        $mech->get($uri);

        # Common selector prefix for the title box of this article.
        my $post = qq{div[id="post_${article_id}"]>div.inbox>div[class="tit-box"]};

        $c = scraper {
            process "${post}>div.fl>table>tr>td",         title         => 'TEXT';
            process "${post}>div.fl>table>tr>td+td+td>a", category      => 'TEXT';
            process "${post}>div.fl>table>tr>td+td+td>a", category_link => '@href';
            process "${post}>div.fr",                     registered_at => 'TEXT';
            process 'table>tr>td[class="p-nick"]',        author        => 'TEXT';
            process 'div[id="tbody"]',                    content       => 'HTML';
            process 'div[id="tbody"]',                    summary       => 'TEXT';
        }->scrape($mech->content);

        unless ($c->{title}) {
            # A gap inside the already-seen window is skipped; the first gap
            # at or beyond the stored id ends the walk.
            next ARTICLE if $article_id < $ini;
            --$article_id;
            last ARTICLE;
        }

        # Only articles that actually yielded a title are cached.
        $cache->freeze($uri, $c);
    }

    my $entry = XML::Feed::Entry->new('Atom');
    $entry->title($c->{title});
    # Entry links follow the configured cafe link rather than a hard-coded
    # cafe name.
    $entry->link("$cafeLink/$article_id");
    $entry->summary($c->{summary});
    $entry->author($c->{author});

    # Set the issue date when the scraped text can be understood; otherwise
    # report the problem on STDERR and still publish the entry without it.
    my $stamp     = _registered_at_to_w3cdtf($c->{registered_at});
    my $date_done = defined $stamp && eval {
        $entry->issued(DateTime::Format::W3CDTF->parse_datetime($stamp));
        1;
    };
    unless ($date_done) {
        my $reason = defined $stamp ? $@ : 'unrecognised format';
        my $shown  = defined $c->{registered_at} ? $c->{registered_at} : '';
        $reason =~ s/\s+\z//;
        warn "article $article_id: cannot use registration date '$shown': $reason\n";
    }

    $entry->content(XML::Feed::Content->new({ body => $c->{content} }));
    $feed->add_entry($entry);
}
|
|
|
# Persist the results: first the generated Atom feed, then the final value of
# $article_id into the state file. Every open/print/close is checked so a
# failed write aborts the script instead of passing unnoticed, and the state
# file is only touched after the feed was written successfully.
my $feed_path = '/home/[your home]/iij-feed.xml';
open my $fh, '>', $feed_path
    or die "Cannot open $feed_path for writing: $!";
print {$fh} $feed->as_xml
    or die "Cannot write $feed_path: $!";
close $fh
    or die "Cannot close $feed_path: $!";

my $last_id_path = '/home/[your home]/tools/last_insert_id.ini';
open my $fh2, '>', $last_id_path
    or die "Cannot open $last_id_path for writing: $!";
print {$fh2} $article_id
    or die "Cannot write $last_id_path: $!";
# Close the state-file handle itself (the original closed $fh a second time
# here), so buffered write errors on it are detected.
close $fh2
    or die "Cannot close $last_id_path: $!";