Skip to content

Instantly share code, notes, and snippets.

@JEEN
Created October 28, 2009 02:03
Show Gist options
  • Save JEEN/220173 to your computer and use it in GitHub Desktop.
Save JEEN/220173 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use LWP::UserAgent;
use Config::Pit;
use WWW::Mechanize;
use Web::Scraper;
use URI;
use Data::Dumper;
use XML::Feed;
use DateTime;
use DateTime::Format::MySQL;
# Log in to Naver so that later requests made through $mech reuse the same
# user-agent session. Credentials are read via Config::Pit under the key
# "www.naver.com"; the `require` hash lists the two fields (id, pw) that
# must be present, each with a placeholder value.
my $config = pit_get("www.naver.com", require => {
    id => "your id on example",
    pw => "your pw on example",
});

# NOTE(review): the login page is requested over plain http -- confirm
# whether an https endpoint should be used before sending credentials.
my $url  = "http://nid.naver.com/nidlogin.login";
my $mech = WWW::Mechanize->new();
$mech->get($url);

# Fill in and submit the form named 'frmNIDLogin' with the stored credentials.
my $res = $mech->submit_form(
    form_name => 'frmNIDLogin',
    fields    => { id => $config->{id}, pw => $config->{pw} },
);

# Abort early when the submission produced no response or an HTTP error;
# scraping articles without a session would only yield empty pages.
# NOTE(review): an HTTP success alone presumably does not prove that the
# credentials were accepted -- verify against the returned page if needed.
die "Login request to $url failed: "
    . ( $res ? $res->status_line : 'no response' ) . "\n"
    unless $res && $res->is_success;
# Create the Atom feed that collects one entry per scraped article and fill
# in its static metadata.
my $feed = XML::Feed->new('Atom');
$feed->title("IIJ");
$feed->link("http://cafe.naver.com/itcareer");
$feed->description("");
$feed->author("IIJ Members");
$feed->id("tag:cafe.naver.com:itcareer");

# Stamp the feed with the current time in the Asia/Tokyo zone. The DateTime
# object already carries its zone, so it is handed over directly instead of
# being stringified, suffixed with '+09:00' and re-parsed through
# DateTime::Format::W3CDTF (a module this file never loads itself).
$feed->modified(DateTime->now( time_zone => 'Asia/Tokyo' ));
# Walk article ids upward, starting just after $article_id, and turn every
# article page into one Atom entry on $feed. The loop ends at the first id
# whose page yields no title.
my $article_id = 18735;
while (1) {
    # The id is advanced before the fetch, so the first article requested
    # is 18736.
    $article_id++;
    $mech->get(sprintf(
        "http://cafe.naver.com/ArticleRead.nhn?articleid=%s&clubid=11192577",
        $article_id,
    ));

    # Shared selector prefix for the title box of this particular post.
    my $tit_box
        = "div[id=\"post_${article_id}\"]>div.inbox>div[class=\"tit-box\"]";

    # category / category_link are scraped but not currently written to
    # the feed.
    my $c = scraper {
        process "$tit_box>div.fl>table>tr>td", title => 'TEXT';
        process "$tit_box>div.fl>table>tr>td+td+td>a", category => 'TEXT';
        process "$tit_box>div.fl>table>tr>td+td+td>a", category_link => '@href';
        process "$tit_box>div.fr", registered_at => 'TEXT';
        process "table>tr>td[class=\"p-nick\"]", author => "TEXT";
        process "div[id=\"tbody\"]", content => "HTML";
    }->scrape($mech->content);

    # No title means there is no (readable) article at this id: stop.
    last unless $c->{title};

    # Normalise the date text: drop a leading "<char>/" prefix and trailing
    # whitespace. A missing element is treated as an empty string so the
    # substitutions never operate on undef.
    my $registered_at
        = defined $c->{registered_at} ? $c->{registered_at} : '';
    $registered_at =~ s/^.\///;
    $registered_at =~ s/\s+$//;

    my $entry = XML::Feed::Entry->new('Atom');
    $entry->title($c->{title});
    $entry->link("http://cafe.naver.com/itcareer/$article_id");
    $entry->summary("..");
    $entry->author($c->{author});
    $entry->content(XML::Feed::Content->new({ body => $c->{content} }));

    # Expected shape: "YYYY.M.D HH:MM". The timestamp is built as a
    # DateTime in the Asia/Tokyo zone (UTC+09:00). An unparsable or invalid
    # date only costs this entry its issued date instead of aborting the
    # whole run and losing every entry gathered so far.
    if ( my ($yy, $mm, $dd, $hh, $ii)
        = $registered_at =~ /^(....)\.(.{1,2})\.(.{1,2}) (..):(..)$/ )
    {
        my $issued = eval {
            DateTime->new(
                year      => int($yy),
                month     => int($mm),
                day       => int($dd),
                hour      => int($hh),
                minute    => int($ii),
                second    => 0,
                time_zone => 'Asia/Tokyo',
            );
        };
        if ($issued) {
            $entry->issued($issued);
        }
        else {
            warn "Article $article_id: invalid date '$registered_at': $@";
        }
    }
    else {
        warn "Article $article_id: unrecognised date '$registered_at'\n";
    }

    $feed->add_entry($entry);
}
# Serialise the feed to feed.xml in the current directory. Every step is
# checked: buffered write errors may only surface at close, and a silently
# truncated or missing feed file would otherwise go unnoticed.
open my $fh, ">", "feed.xml"
    or die "Cannot open feed.xml for writing: $!";
print {$fh} $feed->as_xml
    or die "Cannot write to feed.xml: $!";
close $fh
    or die "Cannot close feed.xml: $!";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment