Skip to content

Instantly share code, notes, and snippets.

@sotarok
Created March 30, 2010 16:55
Show Gist options
  • Select an option

  • Save sotarok/349295 to your computer and use it in GitHub Desktop.

Select an option

Save sotarok/349295 to your computer and use it in GitHub Desktop.
<?php
/**
* HatenaGroupDumper
*
* 失敗はほぼ考慮しない適当スクリプト
* はてなグループの日記をHTMLで保存します
*/
require_once 'HTTP/Client.php';
/**
 * Create a directory unless it already exists.
 *
 * @param string $dir Path of the directory to create.
 * @return void
 * @throws Exception When the directory does not exist and cannot be created.
 */
function hgd_ensure_dir($dir)
{
    // The trailing is_dir() re-check tolerates the directory appearing
    // between the first check and the mkdir() call.
    if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
        throw new Exception("Cannot create save dir: " . $dir);
    }
}

/**
 * Parse an HTML string into a SimpleXMLElement.
 *
 * DOMDocument::loadHTML() is an instance method; it is called on a fresh
 * DOMDocument here instead of statically. libxml warnings caused by
 * malformed markup are collected internally and discarded rather than
 * silenced with the @ operator.
 *
 * @param mixed $html HTML source (normally a response body string).
 * @return SimpleXMLElement|false The parsed document, or false when the
 *                                input is empty or cannot be parsed.
 */
function hgd_parse_html($html)
{
    // loadHTML() rejects an empty string, so bail out before calling it.
    if (!is_string($html) || $html === '') {
        return false;
    }
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        return false;
    }
    $xml = simplexml_import_dom($doc);
    return ($xml instanceof SimpleXMLElement) ? $xml : false;
}

/**
 * Run an XPath query and always return an array.
 *
 * @param SimpleXMLElement $node  Node to query.
 * @param string           $query XPath expression.
 * @return array Matching elements; empty when nothing matches or the
 *               query fails.
 */
function hgd_xpath($node, $query)
{
    $found = $node->xpath($query);
    return is_array($found) ? $found : array();
}

// arg check: exactly three arguments are required after the script name
if ($argc != 4) {
    echo <<<E
HatenaGroupDumper:
This is a tiny script to download Hatena Group Diary as HTML file.
Usage:
% php HatenaGroupDumper.php login_name group_name user_name
E;
    echo PHP_EOL;
    // Non-zero status so callers can tell the script did not run.
    exit(1);
}

// get password
// NOTE(review): the password is read with plain fgets(), so the terminal
// will most likely echo it while it is typed.
$user = $argv[1];
echo "Login with '{$user}', enter password: ";
$password = trim((string)fgets(STDIN));

// Output goes to ./data/<user_name> next to this script.
define('SAVE_DIR_BASE', dirname(__FILE__) . '/data');
define('SAVE_DIR', SAVE_DIR_BASE . '/' . $argv[3]);
define('URL_HATENA', "http://www.hatena.ne.jp/");
define('URL_GROUP_BASE', "http://{$argv[2]}.g.hatena.ne.jp");
define('URL_TARGET_BASE', "http://{$argv[2]}.g.hatena.ne.jp/{$argv[3]}/archive");

try {
    hgd_ensure_dir(SAVE_DIR_BASE);
    hgd_ensure_dir(SAVE_DIR);

    // Log in first; the same client object is reused for every later request.
    // NOTE(review): the login response is not inspected, so a failed login
    // is only noticed indirectly (e.g. the archive page has no month links).
    $c = new HTTP_Client();
    $c->get(URL_HATENA . 'login');
    $c->post(URL_HATENA . 'login', array('name' => $user, 'password' => $password, 'auto_login' => 1,));

    // Fetch the archive top page, which links to one page per month.
    $c->get(URL_TARGET_BASE);
    $res = $c->currentResponse();
    if ($res['code'] !== 200) {
        throw new Exception("archive get error");
    }
    $body = hgd_parse_html($res['body']);
    if ($body === false) {
        throw new Exception("archive parse error");
    }

    $months = hgd_xpath($body, "id('archive-calendar-top')//a[@class='month']");
    if (!$months) {
        echo "No month links found on the archive page (login failed or empty diary?)", PHP_EOL;
    }

    foreach ($months as $mon) {
        $url = (string)$mon->attributes()->href;
        // end() takes its argument by reference, so it needs a real variable.
        $segments = explode('/', $url);
        $mon_label = end($segments);
        echo "$mon_label :", PHP_EOL;

        $c->get(URL_TARGET_BASE . '/' . $mon_label);
        $res = $c->currentResponse();
        if ($res['code'] !== 200) {
            echo "\t{$mon_label} Skipped... (some HTTP error) ", PHP_EOL;
            continue;
        }
        $diary_body = hgd_parse_html($res['body']);
        if ($diary_body === false) {
            echo "\t{$mon_label} Skipped... (HTML parse error) ", PHP_EOL;
            continue;
        }

        foreach (hgd_xpath($diary_body, "//li[@class='archive archive-date']/a") as $diary_url) {
            $diary_real_url = URL_GROUP_BASE . (string)$diary_url->attributes()->href;
            echo "\t{$diary_real_url} ... ";
            $c->get($diary_real_url);
            $res_diary = $c->currentResponse();
            if ($res_diary['code'] !== 200) {
                echo " some error, skipped. ", PHP_EOL;
                continue;
            }

            // The link text comes from the remote page; neutralise path
            // separators so it cannot point outside SAVE_DIR.
            $entry_label = str_replace(array('/', '\\'), '_', (string)$diary_url);
            $fn = SAVE_DIR . '/' . $mon_label . '-' . $entry_label . '.html';
            if (file_put_contents($fn, $res_diary['body']) === false) {
                echo " write error, skipped. ", PHP_EOL;
                continue;
            }
            echo " done.", PHP_EOL;
            // Pause between entries to avoid hammering the server.
            sleep(1);
        }
    }
} catch (Exception $e) {
    echo "Some Exception!!", PHP_EOL;
    echo $e->getMessage(), PHP_EOL;
    // Report the failure through the exit status as well.
    exit(1);
}
@wozozo
Copy link

wozozo commented Mar 31, 2010

test

@wozozo
Copy link

wozozo commented Mar 31, 2010

????(^o^)?

@wozozo
Copy link

wozozo commented Mar 31, 2010

mojibake ta.

@sotarok
Copy link
Author

sotarok commented Mar 31, 2010

こめんと

@sotarok
Copy link
Author

sotarok commented Mar 31, 2010

あれ?文字化けしないよ?

@wozozo
Copy link

wozozo commented Mar 31, 2010

こめんと

@wozozo
Copy link

wozozo commented Mar 31, 2010

おお。直ってるw

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment