Last active
August 29, 2015 14:06
-
-
Save mgng/7a9d6fd7224e68e9db87 to your computer and use it in GitHub Desktop.
twitpicからダウンロードするやつ
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Twitpic downloader.
// Usage: change $screen_name to the account you want to save, then run from a console:
//   php twitpic_downloader.php
set_time_limit( 0 );

// Settings
$screen_name = "mugng"; // Twitpic account name
$save_dir = "./imgs_{$screen_name}/"; // directory the images are saved into
$result_text = "./result_{$screen_name}.txt"; // tab-separated result log
$base_url = "http://twitpic.com/photos/{$screen_name}";
$wait = 1; // seconds to sleep between listing pages

// Create the image directory and the result log.
// The directory may already exist from a previous run; only a real failure aborts.
if ( !is_dir( $save_dir ) && !mkdir( $save_dir, 0777, true ) ) {
    fwrite( STDERR, "could not create directory {$save_dir}\n" );
    exit( 1 );
}
touch( $result_text );

// Fetch the first listing page; nothing can be done without it.
$html = request( $base_url );
if ( $html === false || $html === "" ) {
    fwrite( STDERR, "could not fetch {$base_url}\n" );
    exit( 1 );
}
$dom = createDOMDocument( $html );

// Find the last page number: the highest ?page=N among the pagination links.
// (Taking the maximum avoids depending on the order of the links.)
$start = 1;
$last = 1;
$xpath = new \DOMXPath( $dom );
$nodes = $xpath->query( "//p[contains(concat(' ', @class, ' '),' pagination ')]/a" );
foreach ( $nodes as $node ) {
    if ( preg_match( '/[?&]page=(\d+)/', $node->getAttribute( "href" ), $m ) ) {
        $last = max( $last, (int)$m[1] );
    }
}

// Process each listing page.
for ( $i = $start; $i <= $last; $i++ ) {
    echo "### page {$i} start.\n";

    // Reference timestamp used to resolve relative dates ("3 days ago") on this page.
    $now = time();

    $html = request( "{$base_url}?page={$i}" );
    if ( $html === false || $html === "" ) {
        // A single unreachable page should not abort the whole run.
        echo "### page {$i} fetch failed, skipped.\n";
        sleep( $wait );
        continue;
    }
    $dom = createDOMDocument( $html );
    $xpath = new \DOMXPath( $dom );
    $nodes = $xpath->query( "//div[contains(concat(' ', @class, ' '), ' user-photo-wrap ')]" );

    foreach ( $nodes as $node ) {
        // All lookups below are relative to the current photo wrapper node.

        // Photo id: the link href with its leading slash removed.
        $link = $xpath->query( "div[contains(concat(' ', @class, ' '), ' user-photo ')]/a", $node )->item( 0 );
        if ( $link === null ) {
            echo "photo link not found, skipped.\n";
            continue;
        }
        $id = (string)preg_replace( "/\A\//u", "", $link->getAttribute( "href" ) );
        // The id comes from remote HTML and ends up in a URL and a file name,
        // so anything that is not a plain token is rejected.
        if ( !preg_match( '/\A[0-9A-Za-z_-]+\z/', $id ) ) {
            echo "unexpected photo id, skipped.\n";
            continue;
        }

        // File extension: taken from the thumbnail src, defaulting to jpg.
        $ext = "jpg";
        $img = $xpath->query( "div[contains(concat(' ', @class, ' '), ' user-photo ')]/a/img", $node )->item( 0 );
        if ( $img !== null && preg_match( "/\d+\.([a-z]+)\?\d+\z/ui", $img->getAttribute( "src" ), $m ) ) {
            $ext = $m[1];
        }

        // Message text; tabs and line breaks are collapsed so the log stays one record per line.
        $message_node = $xpath->query( ".//p[contains(concat(' ', @class, ' '), ' message ')]", $node )->item( 0 );
        $message = $message_node === null ? "" : $message_node->nodeValue;
        $message = trim( (string)preg_replace( '/[\t\r\n]+/', ' ', $message ) );

        // Approximate upload date: strip the leading "about" and the trailing "via ..." part.
        $date_node = $xpath->query( ".//ul[contains(concat(' ', @class, ' '), ' inline-list ')]/li[1]", $node )->item( 0 );
        $date_str = $date_node === null ? "" : (string)preg_replace( "/(\Aabout\s*)|(\s*via\s*.+?\z)/ui", "", $date_node->nodeValue );
        $date_str = trim( (string)preg_replace( '/[\t\r\n]+/', ' ', $date_str ) );
        $timestamp = $date_str === "" ? false : strtotime( $date_str, $now );
        // An unparseable date gets an all-zero placeholder instead of a bogus 1970 date.
        $created_date = $timestamp === false ? "00000000000000" : date( "YmdHis", $timestamp );

        // Download the full-size image with the same headers as the page requests,
        // and only report success when the file was actually written.
        $status = "failed";
        $img_src = request( "http://twitpic.com/show/full/{$id}" );
        if ( $img_src !== false && $img_src !== "" ) {
            if ( file_put_contents( "{$save_dir}{$created_date}_{$id}.{$ext}", $img_src ) !== false ) {
                $status = "success";
            }
        }

        // Append the outcome to the result log and report it on the console.
        file_put_contents( $result_text, "{$status}\t{$id}\t{$created_date}\t{$date_str}\t{$message}\n", FILE_APPEND | LOCK_EX );
        echo "{$id} save {$status}.\n";
    }
    sleep( $wait );
}
//----------- function | |
/**
 * Fetch a URL with a GET request, sending a browser-like User-Agent and a
 * twitpic.com Referer header.
 *
 * @param string    $url     URL to fetch.
 * @param int|float $timeout Read timeout in seconds for the HTTP stream.
 * @return string|false Response body, or false when file_get_contents() fails
 *                      (callers must check for false before using the result).
 */
function request( $url, $timeout = 30 ) {
    $header = implode( "\r\n", array(
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
        "Referer: http://twitpic.com/",
    ) );
    $context = stream_context_create( array(
        "http" => array(
            "method"  => "GET",
            "header"  => $header,
            // Explicit timeout so a stalled connection does not depend on the ini default.
            "timeout" => $timeout,
        ),
    ) );
    return file_get_contents( $url, false, $context );
}
/**
 * Parse an HTML string into a DOMDocument, suppressing libxml parse warnings.
 *
 * @param string|false $html HTML source; an empty string or false (a failed
 *                           download) yields an empty document.
 * @return \DOMDocument Parsed document, or an empty one when there is no input.
 */
function createDOMDocument( $html ) {
    $dom = new \DOMDocument();

    // loadHTML() rejects empty input (ValueError on PHP 8, warning before),
    // so hand back an empty document that XPath queries simply find nothing in.
    $html = (string)$html;
    if ( $html === "" ) {
        return $dom;
    }

    // Collect parse errors internally instead of emitting warnings, then
    // discard them and restore whatever setting the caller had.
    $previous = libxml_use_internal_errors( true );
    $dom->loadHTML( $html );
    libxml_clear_errors();
    libxml_use_internal_errors( $previous );

    return $dom;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment