|
use strict; |
|
use warnings; |
|
use LWP::UserAgent; |
|
use Config::Pit; |
|
use WWW::Mechanize; |
|
use Web::Scraper; |
|
use URI; |
|
use Data::Dumper; |
|
use XML::Feed; |
|
use DateTime; |
|
use DateTime::Format::MySQL; |
|
|
|
# Look up the Naver account credentials through Config::Pit under the
# "www.naver.com" key. The map passed as `require` lists the fields the
# script reads later (id, pw); its values are the texts handed to pit_get
# for each field.
my %required_fields = (
    id => "your id on example",
    pw => "your pw on example",
);

my $config = pit_get("www.naver.com", require => \%required_fields);
|
|
|
# Sign in to Naver: load the login page, then fill in and submit the form
# named 'frmNIDLogin' with the id/pw taken from $config. The same $mech
# object is used for the article requests further down.
# NOTE(review): the login page is requested over plain http -- confirm
# whether the https endpoint should be used when sending credentials.
my $url  = "http://nid.naver.com/nidlogin.login";
my $mech = WWW::Mechanize->new;
$mech->get($url);

my %login_fields = (
    id => $config->{id},
    pw => $config->{pw},
);

# $res holds the response to the form submission; nothing below reads it.
my $res = $mech->submit_form(
    form_name => 'frmNIDLogin',
    fields    => \%login_fields,
);
|
|
|
# Create the Atom feed that collects the scraped articles and fill in its
# feed-level metadata.
my $feed = XML::Feed->new('Atom');
$feed->title("IIJ");
$feed->link("http://cafe.naver.com/itcareer");
$feed->description("");
$feed->author("IIJ Members");
$feed->id("tag:cafe.naver.com:itcareer");

# Stamp the feed with the current time in the Asia/Tokyo zone (UTC+09:00).
# The DateTime object is passed straight through rather than being
# stringified, suffixed with '+09:00' and re-parsed by
# DateTime::Format::W3CDTF, a module this script never loads itself.
$feed->modified(DateTime->now(time_zone => 'Asia/Tokyo'));
|
# Crawl cafe articles one id at a time and add each one to the feed.
# The counter is bumped before every request, so the first article fetched
# is 18736; the loop ends at the first page that yields no title.
# NOTE(review): presumably 18735 is the last article already handled and a
# title-less page marks the end of the board -- confirm.
my $article_id = 18735;

while (1) {
    $article_id++;

    $mech->get(sprintf("http://cafe.naver.com/ArticleRead.nhn?articleid=%s&clubid=11192577", $article_id));

    # All header fields sit under the same per-post container, so build
    # that selector prefix once per article.
    my $post_header = "div[id=\"post_${article_id}\"]>div.inbox>div[class=\"tit-box\"]";

    my $c = scraper {
        process "$post_header>div.fl>table>tr>td",         title         => 'TEXT';
        process "$post_header>div.fl>table>tr>td+td+td>a", category      => 'TEXT';
        process "$post_header>div.fl>table>tr>td+td+td>a", category_link => '@href';
        process "$post_header>div.fr",                     registered_at => 'TEXT';
        process "table>tr>td[class=\"p-nick\"]",           author        => "TEXT";
        process "div[id=\"tbody\"]",                       content       => "HTML";
    }->scrape($mech->content);

    # No title scraped: stop crawling.
    last unless $c->{title};

    # Normalise the scraped date text: drop a leading "<any char>/" prefix
    # and trailing whitespace. A missing date cell becomes the empty string
    # so the substitutions below never operate on undef.
    my $date = defined $c->{registered_at} ? $c->{registered_at} : '';
    $date =~ s/^.\///;
    $date =~ s/\s+$//;

    my $entry = XML::Feed::Entry->new('Atom');
    $entry->title($c->{title});
    $entry->link("http://cafe.naver.com/itcareer/$article_id");
    $entry->summary("..");
    $entry->author($c->{author});

    # Expected date form: "YYYY.M.D HH:MM", interpreted at UTC+09:00.
    # An unparseable or out-of-range date is reported and the entry is
    # still added (without an issued date) instead of aborting the run.
    if (my ($yy, $mm, $dd, $hh, $ii)
        = $date =~ /^([0-9]{4})\.([0-9]{1,2})\.([0-9]{1,2}) ([0-9]{2}):([0-9]{2})$/)
    {
        # DateTime->new dies on out-of-range values (e.g. month 13).
        my $issued = eval {
            DateTime->new(
                year      => $yy,
                month     => $mm,
                day       => $dd,
                hour      => $hh,
                minute    => $ii,
                second    => 0,
                time_zone => '+0900',
            );
        };
        if ($issued) {
            $entry->issued($issued);
        }
        else {
            warn "article $article_id: invalid date '$date': $@";
        }
    }
    else {
        warn "article $article_id: cannot parse date '$date'; entry added without an issued date\n";
    }

    $entry->content(XML::Feed::Content->new({ body => $c->{content} }));

    $feed->add_entry($entry);
}
|
|
|
# Serialize the finished feed and write it to feed.xml in the current
# working directory. Every I/O step is checked so a failed open, a short
# write or a flush error at close aborts with a message instead of
# silently leaving a missing or truncated file.
# NOTE(review): no encoding layer is set on the handle -- confirm that
# as_xml returns already-encoded bytes for non-ASCII article text.
my $feed_path = "feed.xml";

open my $fh, ">", $feed_path
    or die "Cannot open $feed_path for writing: $!";

print {$fh} $feed->as_xml
    or die "Cannot write to $feed_path: $!";

close $fh
    or die "Cannot close $feed_path: $!";