Skip to content

Instantly share code, notes, and snippets.

@JEEN
Created November 5, 2009 11:54
Show Gist options
  • Save JEEN/226999 to your computer and use it in GitHub Desktop.
Save JEEN/226999 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use Config::Pit;
use WWW::Mechanize;
use Web::Scraper;
use XML::Feed;
use DateTime;
use DateTime::Format::MySQL;
use Cache::File;
use Data::Dumper;
use HTML::Entities;
use utf8;
# Look up the "www.daum.net" credentials (id / pw) through Config::Pit.
my $config = pit_get(
    "www.daum.net",
    require => {
        id => "your id on example",
        pw => "your pw on example",
    },
);

# Load the portal front page and submit its login form with those
# credentials. The same $mech object is used for the article requests
# further down the script.
my $url  = "http://www.daum.net/";
my $mech = WWW::Mechanize->new();
$mech->get($url);

my %login_fields = (
    id => $config->{id},
    pw => $config->{pw},
);

# NOTE(review): $res is kept but nothing later in the script inspects it.
my $res = $mech->submit_form(
    form_name => 'loginform',
    fields    => \%login_fields,
);
# Create the Atom feed container and fill in its fixed metadata.
my $feed = XML::Feed->new('Atom');
$feed->title("DYM");
$feed->link("http://cafe.daum.net/japantokyo");
$feed->description("");
$feed->author("DYM Members");
$feed->id("tag:cafe.daum.net:dym");

# Stamp the feed with the current time in Tokyo. The DateTime object is
# handed over directly instead of being stringified, suffixed with "+09:00"
# and re-parsed, which also removes the reliance on a date-format module
# that this script never loads itself.
$feed->modified(DateTime->now( time_zone => 'Asia/Tokyo' ));
# Read the whole state file in one go and drop every CR/LF character, leaving
# the bare value that the scan below uses as its reference article id.
my $ini = do {
    open my $ini_fh, "<", "/home/leejj/tools/last_insert_id2.ini" or die $!;
    local $/;    # slurp mode: a single read returns the complete file
    my $raw = <$ini_fh>;
    close $ini_fh;
    $raw;
};
$ini =~ tr/\r\n//d;
# File-backed cache rooted at /tmp/atomgen with default_expires set to
# '1 hour'; the scan below stores each scraped article in it, keyed by URL.
my %cache_args = (
    cache_root      => "/tmp/atomgen",
    default_expires => '1 hour',
);
my $cache = Cache::File->new(%cache_args);
die $! unless $cache;
# Walk the board one article id at a time, starting 19 ids before the value
# read from the state file, and add every readable article to $feed.
#
# A page that yields no title is treated as a missing article:
#   * missing ids below $ini are simply skipped;
#   * the second missing id at or beyond $ini ends the scan, after stepping
#     $article_id back by one (this is the value written to the state file
#     at the end of the script).
# $passed counts all such misses during the run, not only consecutive ones.
my $article_id = $ini - 19;

# With a stored id below 19 the start value used to be negative, so the loop
# condition failed immediately, nothing was scraped and a negative id was
# written back. Clamp the starting point to zero instead.
$article_id = 0 if $article_id < 0;

my $passed = 0;

# The scraper definition does not depend on the article, so build it once
# rather than on every iteration.
my $article_scraper = scraper {
    process 'xmp[id="template_xmp"]', content => 'HTML';
    process 'div[class="subject"]', title => 'TEXT';
    process 'div[class="article_writer"]>a', author => 'TEXT';
    process 'div[class="article_writer"]>a+span+span+span+span', registered_at => 'TEXT';
};

ARTICLE:
while ($article_id++ >= 0) {
    my $uri = sprintf("http://cafe308.daum.net/_c21_/bbs_read?grpid=sT2&fldid=5lEZ&datanum=%s", $article_id);
    my $c;

    if ($cache->exists($uri)) {
        $c = $cache->thaw($uri);
    }
    else {
        $mech->get($uri);
        $c = $article_scraper->scrape($mech->content);

        unless ($c->{title}) {
            next ARTICLE if $article_id < $ini;
            $passed++;
            if ($passed > 1) {
                --$article_id;
                last ARTICLE;
            }
            next ARTICLE;
        }

        # Strip the board's tag-restriction notice and decode HTML entities.
        # Guarded so a page without the content element does not trigger
        # "uninitialized value" warnings.
        if (defined $c->{content}) {
            $c->{content} =~ s/x?x?\[안내\]태그제한으로등록되지않습니다-//g;
            $c->{content} = decode_entities($c->{content});
        }

        # Only articles that produced a title are cached.
        $cache->freeze($uri, $c);
    }

    next ARTICLE unless $c->{title};
    next ARTICLE if $c->{title} eq 'Untitled';

    my $entry = XML::Feed::Entry->new('Atom');
    $entry->title($c->{title});
    $entry->link("http://cafe.daum.net/japantokyo/5lEZ/$article_id");
    $entry->summary($c->{content});
    $entry->author($c->{author});

    # registered_at is expected as "YY.M.D HH:MM". The timestamp is built
    # directly as a DateTime at +09:00, so no date-format module that the
    # script does not load itself is needed. If the text does not match or
    # the values do not form a valid date, the raw text is printed, as the
    # script has always done for dates it could not parse.
    my $stamp = defined $c->{registered_at} ? $c->{registered_at} : '';
    my $issued_ok = 0;
    if (my ($yy, $mm, $dd, $hh, $ii)
        = $stamp =~ /^([0-9]{2})\.([0-9]{1,2})\.([0-9]{1,2}) ([0-9]{2}):([0-9]{2})$/)
    {
        $issued_ok = eval {
            $entry->issued(
                DateTime->new(
                    year      => 2000 + $yy,
                    month     => $mm,
                    day       => $dd,
                    hour      => $hh,
                    minute    => $ii,
                    second    => 0,
                    time_zone => '+0900',
                )
            );
            1;
        };
    }
    print $stamp unless $issued_ok;

    $entry->content(XML::Feed::Content->new({ body => $c->{content} }));
    $feed->add_entry($entry);
}
# Persist the results of this run: first the rendered feed, then the article
# id that the start of the script reads back on the next run.
#
# Every open/close is checked, because write errors on buffered handles may
# only surface at close time. The second close now targets $fh2; it used to
# close the already-closed feed handle again and never closed the state file
# explicitly.
open my $fh, ">", "/home/leejj/dym-feed.xml"
    or die "Cannot open /home/leejj/dym-feed.xml for writing: $!";
print $fh $feed->as_xml;
close $fh
    or die "Cannot close /home/leejj/dym-feed.xml: $!";

open my $fh2, ">", "/home/leejj/tools/last_insert_id2.ini"
    or die "Cannot open /home/leejj/tools/last_insert_id2.ini for writing: $!";
print $fh2 $article_id;
close $fh2
    or die "Cannot close /home/leejj/tools/last_insert_id2.ini: $!";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment