Skip to content

Instantly share code, notes, and snippets.

@JEEN
Created October 30, 2009 06:21
Show Gist options
  • Select an option

  • Save JEEN/222179 to your computer and use it in GitHub Desktop.

Select an option

Save JEEN/222179 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;

use Cache::File;
use Config::Pit;
use DateTime;
use DateTime::Format::MySQL;
use DateTime::Format::W3CDTF;
use Web::Scraper;
use WWW::Mechanize;
use XML::Feed;
# Fetch the Naver credentials through Config::Pit under the key
# "www.naver.com"; the `require` hash names the two settings (id, pw)
# this script reads, with example values.
my $config = pit_get(
    'www.naver.com',
    require => {
        id => 'your id on example',
        pw => 'your pw on example',
    },
);

# Open the login page, then fill in and submit the form named
# "frmNIDLogin" so that $mech carries the session for later requests.
my $url  = 'http://nid.naver.com/nidlogin.login';
my $mech = WWW::Mechanize->new;
$mech->get($url);

my $res = $mech->submit_form(
    form_name => 'frmNIDLogin',
    fields    => +{ map { ($_ => $config->{$_}) } qw(id pw) },
);
# Per-cafe settings: display name, public URL and numeric club id.
my $cafeName = 'your cafe name';
my $cafeLink = 'http://cafe.naver.com/[cafe]';
my $cafeId   = '11192577';

# Create the Atom feed and fill in its top-level metadata.  The setters are
# applied in a fixed order (title, link, description, author, id, modified).
my $feed = XML::Feed->new('Atom');
{
    # The "modified" value is the current time, stringified, suffixed with a
    # +09:00 offset and re-parsed as a W3CDTF timestamp.
    my $generated_at = DateTime->now(time_zone => 'Asia/Tokyo');

    my @feed_fields = (
        [ title       => $cafeName ],
        [ link        => $cafeLink ],
        [ description => 'Generated by AtomGen / JEEN' ],
        [ author      => $cafeName ],
        [ id          => 'tag:cafe.naver.com:' . $cafeId ],
        [ modified    => DateTime::Format::W3CDTF->parse_datetime($generated_at . '+09:00') ],
    );

    for my $field (@feed_fields) {
        my ($setter, $value) = @{$field};
        $feed->$setter($value);
    }
}
# Load the article id recorded at the end of the previous run.  The file is
# expected to contain a single non-negative integer; line endings and
# surrounding whitespace are ignored.
my $ini = do {
    my $ini_path = '/home/[your home]/tools/last_insert_id.ini';
    open my $fw, '<', $ini_path
        or die "Cannot read $ini_path: $!";
    local $/;                       # slurp the whole file in one read
    my $text = <$fw>;
    close $fw;

    $text = '' unless defined $text;
    $text =~ s/[\r\n]//g;
    $text =~ s/^\s+|\s+$//g;

    # Refuse to continue on an empty or corrupt state file: arithmetic on it
    # would yield a negative start id, the scrape loop would be skipped, and
    # the bogus id would then be written back over the state file.
    die "Invalid last article id '$text' in $ini_path\n"
        unless $text =~ /\A\d+\z/;
    $text;
};

# File-backed cache for scraped articles; entries expire after one hour.
my $cache = Cache::File->new(
    cache_root      => "/tmp/atomgen",
    default_expires => '1 hour',
) or die $!;

# Start 19 ids before the recorded one.  The main loop increments before it
# fetches, so the first article requested is $ini - 18.  Clamp at zero so a
# small recorded id cannot produce a negative start, which would make the
# loop condition false on its very first test.
my $article_id = $ini - 19;
$article_id = 0 if $article_id < 0;
# Walk article ids upward, adding one feed entry per article that has a title.
# Each id is incremented before use.  The loop ends at the first id that is
# not below $ini and yields no title; $article_id is then stepped back by one
# so the value left behind is the id just before that missing article.
while ($article_id++ >= 0) {
    my $uri = sprintf(
        "http://cafe.naver.com/ArticleRead.nhn?articleid=%s&clubid=%s",
        $article_id, $cafeId,
    );

    # $c holds the scraped fields for this article, from cache when possible.
    my $c;
    if ($cache->exists($uri)) {
        $c = $cache->thaw($uri);
    }
    else {
        $mech->get($uri);

        # Shared selector prefix for the article's title box.
        my $title_box = qq(div[id="post_${article_id}"]>div.inbox>div[class="tit-box"]);

        $c = scraper {
            process "$title_box>div.fl>table>tr>td",         title         => 'TEXT';
            process "$title_box>div.fl>table>tr>td+td+td>a", category      => 'TEXT';
            process "$title_box>div.fl>table>tr>td+td+td>a", category_link => '@href';
            process "$title_box>div.fr",                     registered_at => 'TEXT';
            process "table>tr>td[class=\"p-nick\"]",         author        => "TEXT";
            process "div[id=\"tbody\"]",                     content       => "HTML";
            process "div[id=\"tbody\"]",                     summary       => "TEXT";
        }->scrape($mech->content);

        unless ($c->{title}) {
            # No title scraped: skip ids below the recorded one, otherwise
            # step back one id and stop.
            next if $article_id < $ini;
            --$article_id;
            last;
        }
        $cache->freeze($uri, $c);
    }

    # Normalise the scraped date text: drop a leading "<char>/" prefix and
    # trailing whitespace.  Guard against a missing field so the
    # substitutions do not operate on undef.
    my $registered_at = defined $c->{registered_at} ? $c->{registered_at} : '';
    $registered_at =~ s/^.\///;
    $registered_at =~ s/\s+$//;

    my $entry = XML::Feed::Entry->new('Atom');
    $entry->title($c->{title});
    # Build the permalink from the configured cafe URL rather than a
    # hard-coded cafe name, so it follows $cafeLink.
    $entry->link("$cafeLink/$article_id");
    $entry->summary($c->{summary});
    $entry->author($c->{author});

    # Expected date shape: "YYYY.M.D HH:MM".  Only attempt the conversion when
    # the pattern matched; failures are reported on STDERR and the entry is
    # still added, just without an issued date.
    if (my ($yy, $mm, $dd, $hh, $ii)
        = $registered_at =~ /^(....)\.(.{1,2})\.(.{1,2}) (..):(..)$/)
    {
        my $stamp = sprintf("%04d-%02d-%02d %02d:%02d:00", $yy, $mm, $dd, $hh, $ii);

        # The eval returns 1 on success, so failure is detected from its
        # return value instead of from $@.
        my $ok = eval {
            $entry->issued(
                DateTime::Format::W3CDTF->parse_datetime(
                    DateTime::Format::MySQL->parse_datetime($stamp) . '+09:00'
                )
            );
            1;
        };
        warn "Could not convert date '$registered_at' for article $article_id: $@"
            unless $ok;
    }
    else {
        warn "Unrecognised date '$registered_at' for article $article_id\n";
    }

    $entry->content(XML::Feed::Content->new({ body => $c->{content} }));
    $feed->add_entry($entry);
}
# Write the generated Atom document.  Every step is checked because buffered
# write errors may only surface at close time.
my $feed_path = '/home/[your home]/iij-feed.xml';
open my $fh, '>', $feed_path
    or die "Cannot write $feed_path: $!";
print {$fh} $feed->as_xml
    or die "Cannot write to $feed_path: $!";
close $fh
    or die "Cannot close $feed_path: $!";

# Record the id reached by the scrape loop for the next run.  This happens
# only after the feed has been written successfully.
my $state_path = '/home/[your home]/tools/last_insert_id.ini';
open my $fh2, '>', $state_path
    or die "Cannot write $state_path: $!";
print {$fh2} $article_id
    or die "Cannot write to $state_path: $!";
# Close the state handle itself (the original closed $fh a second time).
close $fh2
    or die "Cannot close $state_path: $!";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment