Created
March 30, 2010 16:55
-
-
Save sotarok/349295 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * HatenaGroupDumper
 *
 * A small command-line script that logs in to Hatena and saves the diary
 * pages of one user of a Hatena Group as HTML files under
 * <script dir>/data/<user_name>/.
 *
 * Error handling is best-effort: a page that cannot be fetched, parsed or
 * written is reported and skipped; only setup failures (save directory,
 * archive index) abort the run.
 *
 * Usage: php HatenaGroupDumper.php login_name group_name user_name
 */
require_once 'HTTP/Client.php';

/**
 * GET a URL and return the response belonging to that request.
 *
 * @param HTTP_Client $client client holding the login session cookies
 * @param string      $url    absolute URL to fetch
 * @return array|null the response array (its 'code' and 'body' entries are
 *                    used by this script) when the status is 200, otherwise null
 */
function hgd_fetch($client, $url)
{
    $status = $client->get($url);
    // When the request itself fails, get() hands back an error object instead
    // of a status code. currentResponse() would then still describe an earlier
    // request, so it must not be consulted.
    if (is_object($status)) {
        return null;
    }
    $response = $client->currentResponse();
    if (!is_array($response) || !isset($response['code']) || (int)$response['code'] !== 200) {
        return null;
    }
    return $response;
}

/**
 * Parse an HTML string into a SimpleXMLElement.
 *
 * libxml warnings about malformed markup are collected internally and
 * discarded instead of being silenced with the @ operator.
 *
 * @param mixed $html HTML source (anything but a non-empty string yields null)
 * @return SimpleXMLElement|null the document root, or null when parsing fails
 */
function hgd_parse_html($html)
{
    if (!is_string($html) || $html === '') {
        return null;
    }
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        return null;
    }
    $xml = simplexml_import_dom($dom);
    return ($xml instanceof SimpleXMLElement) ? $xml : null;
}

/**
 * Run an XPath query and always return something safe to iterate over.
 *
 * @param SimpleXMLElement $xml   node to query
 * @param string           $query XPath expression
 * @return array matching nodes; empty when nothing matches or the query fails
 */
function hgd_xpath($xml, $query)
{
    $nodes = $xml->xpath($query);
    return is_array($nodes) ? $nodes : array();
}

/**
 * Make text taken from a remote page safe to use as a single file name.
 *
 * Only path separators and NUL bytes are replaced, so the names produced for
 * ordinary labels are unchanged.
 *
 * @param string $name raw name
 * @return string name that cannot point outside the save directory
 */
function hgd_safe_name($name)
{
    return str_replace(array('/', '\\', "\0"), '_', $name);
}

// arg check
if ($argc != 4) {
    echo <<<E
HatenaGroupDumper:
This is a tiny script to download Hatena Group Diary as HTML file.
Usage:
% php HatenaGroupDumper.php login_name group_name user_name
E;
    // A usage error is a failure: report it through the exit status.
    exit(1);
}

// get password
$user = $argv[1];
echo "Login with '{$user}', enter password: ";
// Strip only the line terminator so that a password with leading or trailing
// spaces is sent unchanged.
$password = rtrim((string)fgets(STDIN), "\r\n");

define('SAVE_DIR_BASE', dirname(__FILE__) . '/data');
define('SAVE_DIR', SAVE_DIR_BASE . '/' . $argv[3]);
define('URL_HATENA', "http://www.hatena.ne.jp/");
define('URL_GROUP_BASE', "http://{$argv[2]}.g.hatena.ne.jp");
define('URL_TARGET_BASE', "http://{$argv[2]}.g.hatena.ne.jp/{$argv[3]}/archive");

try {
    // Create the base directory first, then the per-user directory inside it.
    foreach (array(SAVE_DIR_BASE, SAVE_DIR) as $dir) {
        if (!is_dir($dir) && !mkdir($dir)) {
            throw new Exception("Cannot create save dir: " . $dir);
        }
    }

    $c = new HTTP_Client();
    // NOTE(review): the credentials are posted over plain HTTP; confirm
    // whether the login endpoint can be reached over HTTPS instead.
    $c->get(URL_HATENA . 'login');
    $c->post(URL_HATENA . 'login', array('name' => $user, 'password' => $password, 'auto_login' => 1,));

    // The archive index lists one link per month.
    $res = hgd_fetch($c, URL_TARGET_BASE);
    if ($res === null) {
        throw new Exception("archive get error");
    }
    $body = hgd_parse_html($res['body']);
    if ($body === null) {
        throw new Exception("archive parse error");
    }

    foreach (hgd_xpath($body, "id('archive-calendar-top')//a[@class='month']") as $mon) {
        $url = (string)$mon->attributes()->href;
        // end() takes its argument by reference, so it needs a real variable.
        $segments = explode('/', $url);
        $mon_label = end($segments);
        echo "$mon_label :", PHP_EOL;

        $res = hgd_fetch($c, URL_TARGET_BASE . '/' . $mon_label);
        if ($res === null) {
            echo "\t{$mon_label} Skipped... (some HTTP error) ", PHP_EOL;
            continue;
        }
        $diary_body = hgd_parse_html($res['body']);
        if ($diary_body === null) {
            echo "\t{$mon_label} Skipped... (unparsable HTML) ", PHP_EOL;
            continue;
        }

        foreach (hgd_xpath($diary_body, "//li[@class='archive archive-date']/a") as $diary_url) {
            // The href is appended to the group host as-is.
            $diary_real_url = URL_GROUP_BASE . $diary_url->attributes()->href;
            echo "\t{$diary_real_url} ... ";

            $res_diary = hgd_fetch($c, $diary_real_url);
            if ($res_diary === null) {
                echo " some error, skipped. ", PHP_EOL;
                continue;
            }

            // The link text comes from the remote page, so neutralise path
            // separators before using it in a file name.
            $fn = SAVE_DIR . '/' . hgd_safe_name($mon_label . '-' . (string)$diary_url) . '.html';
            if (file_put_contents($fn, $res_diary['body']) === false) {
                echo " could not write {$fn}, skipped. ", PHP_EOL;
                continue;
            }
            echo " done.", PHP_EOL;
            // Be polite to the server between diary downloads.
            sleep(1);
        }
    }
} catch (Exception $e) {
    echo "Some Exception!!", PHP_EOL;
    echo $e->getMessage(), PHP_EOL;
    exit(1);
}
????(^o^)?
mojibake ta.
Author
こめんと
Author
あれ?文字化けしないよ?
こめんと
おお。直ってるw
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
test