Skip to content

Instantly share code, notes, and snippets.

@nhnmomonga
Last active December 25, 2015 12:39
Show Gist options
  • Save nhnmomonga/6978124 to your computer and use it in GitHub Desktop.
Save nhnmomonga/6978124 to your computer and use it in GitHub Desktop.
TagCrawler
<?php
/**
 * Crawls a paged CSV search API for a tag and appends selected columns of
 * every returned record to a local file, one comma-joined line per record.
 *
 * The request URL is built as "<api_url>&word=<tag>&p=<page>", so the API URL
 * handed to the constructor must already contain a query string.
 */
class TagCrawler
{
    /** @var string Base API URL; "&word=...&p=..." is appended to it. */
    private $api_url;

    /** @var string Tag (search word) sent as the "word" parameter. */
    private $target_tags;

    /** @var string Path of the file the extracted rows are appended to. */
    private $output_file_name;

    /** Highest page number that will ever be requested. */
    const PAGE_MAX = 200;

    /** Maximum number of records taken from a single page. */
    const MAX_RECORD_PAR_PAGE = 50;

    // Zero-based column positions of the fields copied from each response row.
    const WORK_ID = 0;
    const USER_ID = 1;
    const TITLE = 3;
    const DISPLAYED_USER_NAME = 5;
    const POST_DATE = 12;
    const TAGS = 13;
    const VOTE_NUM = 15;
    const VOTE_POINT = 16;
    const TRAFFIC = 17;
    const CAPTION = 18;
    const PAGE_NUM = 19;

    /**
     * @param string $input_url       Base API URL (must already carry a query string).
     * @param string $input_tag       Tag to search for.
     * @param string $input_file_name Output file path; rows are appended to it.
     */
    public function __construct($input_url, $input_tag, $input_file_name)
    {
        $this->api_url = $input_url;
        $this->target_tags = $input_tag;
        $this->output_file_name = $input_file_name;
    }

    /**
     * Fetches one result page for the configured tag.
     *
     * @param int $page 1-based page number.
     *
     * @return string|false Response body, or false when the request could not
     *                      be initialised or executed.
     */
    private function requestPageByTag($page)
    {
        $url = $this->api_url . "&word=" . $this->target_tags . "&p=" . $page;

        $ch = curl_init($url);
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $response = curl_exec($ch);
        curl_close($ch);

        return $response;
    }

    /**
     * Converts a response body into output rows.
     *
     * Each non-blank line is split on commas, the columns named by the class
     * constants are picked, surrounding double quotes are stripped, and the
     * picked values are joined with commas again. At most MAX_RECORD_PAR_PAGE
     * rows are produced. Blank lines are skipped, so an empty page yields an
     * empty array instead of comma-only rows.
     *
     * NOTE(review): fields are split on every comma, so a quoted field that
     * itself contains a comma shifts the following columns. The split and the
     * unquoted output are kept as-is to stay consistent with rows already
     * appended to existing output files.
     *
     * @param string|false $response_body Raw response body.
     *
     * @return array List of comma-joined rows (strings).
     */
    private function makeOutputData($response_body)
    {
        $result_array = array();
        if (!is_string($response_body) || $response_body === "") {
            return $result_array;
        }

        $columns = array(
            self::WORK_ID,
            self::USER_ID,
            self::TITLE,
            self::DISPLAYED_USER_NAME,
            self::POST_DATE,
            self::TAGS,
            self::VOTE_NUM,
            self::VOTE_POINT,
            self::TRAFFIC,
            self::CAPTION,
            self::PAGE_NUM,
        );

        foreach (explode("\n", $response_body) as $line) {
            if (count($result_array) >= self::MAX_RECORD_PAR_PAGE) {
                break;
            }

            // Drop a trailing CR so CRLF responses do not leak "\r" (and an
            // unstripped quote) into the last column.
            $line = rtrim($line, "\r");
            if (trim($line) === "") {
                continue;
            }

            $fields = explode(",", $line);
            $picked = array();
            foreach ($columns as $index) {
                // A short row yields empty values rather than undefined-offset notices.
                $picked[] = isset($fields[$index]) ? trim($fields[$index], "\"") : "";
            }
            $result_array[] = implode(",", $picked);
        }

        return $result_array;
    }

    /**
     * Runs the crawl: requests pages 1..PAGE_MAX and appends every extracted
     * row to the output file, one row per line.
     *
     * Crawling stops early when a request fails or when a page contains no
     * records, so exhausted result sets no longer produce comma-only lines.
     *
     * @return void
     *
     * @throws RuntimeException When the output file cannot be opened.
     */
    public function exec()
    {
        $fp = fopen($this->output_file_name, "a+");
        if ($fp === false) {
            throw new RuntimeException("Unable to open output file: " . $this->output_file_name);
        }

        for ($page = 1; $page <= self::PAGE_MAX; $page++) {
            $response_body = $this->requestPageByTag($page);
            if ($response_body === false) {
                trigger_error("Request for page " . $page . " failed; stopping crawl.", E_USER_WARNING);
                break;
            }

            $maked_datas = $this->makeOutputData($response_body);
            if (count($maked_datas) === 0) {
                // No records on this page: the result set is exhausted.
                break;
            }

            foreach ($maked_datas as $maked_data) {
                fwrite($fp, $maked_data);
                fwrite($fp, PHP_EOL);
            }
        }

        fclose($fp);
    }
}
// Command-line entry point: php <script> <api_url> <tag> <output_file>
// The API URL must already contain a query string, because the crawler
// appends "&word=<tag>&p=<page>" to it.
if (!isset($argv[1], $argv[2], $argv[3])) {
    $script_name = isset($argv[0]) ? $argv[0] : "TagCrawler.php";
    file_put_contents(
        "php://stderr",
        "Usage: php " . $script_name . " <api_url> <tag> <output_file>" . PHP_EOL
    );
    exit(1);
}
$crawler = new TagCrawler($argv[1], $argv[2], $argv[3]);
$crawler->exec();
@nhnmomonga
Copy link
Author

ページがMAXになるまで動き続けるので、200ページに達する前に結果が無くなった場合、「,,,,,,,」のようなカンマだけの行が出力されることになるので注意。

@nhnmomonga
Copy link
Author

でも後でuniqコマンドとかで取り除けばまあ大丈夫

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment