Skip to content

Instantly share code, notes, and snippets.

@showsky
Created March 20, 2015 15:37
Show Gist options
  • Select an option

  • Save showsky/1fbd8ba1f7b8398bd8a1 to your computer and use it in GitHub Desktop.

Select an option

Save showsky/1fbd8ba1f7b8398bd8a1 to your computer and use it in GitHub Desktop.
Parse PTT data
<?php
// Crawler configuration.
// Constant names must be passed to define() as quoted strings: a bare,
// not-yet-defined identifier is an undefined-constant read, which is an
// Error on PHP 8 (and only worked via a deprecated fallback before that).

// Site root; also prefixed to the relative hrefs found in the pages.
define('URL_BASE', 'https://www.ptt.cc');
// Index page the crawl starts from.
define('URL_INDEX', URL_BASE . '/bbs/Badminton/index.html');
// Crawling stops once more than this many entries have been collected.
define('MAX_LIMIT', 300);
// Substring a title must contain to be kept. The identifier's spelling
// ("WROD") is left as-is because other functions reference it by this name.
define('KEY_WROD', '[揪人]');

// Collected entries; each is array('title' => ..., 'url' => ...).
$result = array();
/**
 * Fetch the raw contents behind a URL (or any path file_get_contents accepts).
 *
 * @param string $url Location to read.
 * @return string|false The body on success, false when the read fails.
 */
function get_html($url) {
    $body = file_get_contents($url);

    return $body;
}
/**
 * Crawl index pages starting at $url, following each page's "previous page"
 * link, and let get_list() collect matching entries into the global $result.
 *
 * The crawl ends when any of the following holds:
 *  - the next URL is exactly URL_BASE,
 *  - more than MAX_LIMIT entries have been collected,
 *  - a page cannot be fetched,
 *  - a page has no previous-page link.
 *
 * @param string $url Absolute URL of the index page to start from.
 * @return void
 */
function parse($url) {
    global $result;

    // Link of the "‹ 上頁" (previous page) button. The href capture is limited
    // to non-quote characters so it cannot run past the attribute's closing quote.
    $pattern = '/<a class="btn wide" href="([^"]+)">&lsaquo; 上頁<\/a>/';

    // Iterative rather than recursive: one stack frame no matter how many
    // pages are visited.
    while ($url !== URL_BASE && count($result) <= MAX_LIMIT) {
        printf("Parse url: %s\n", $url);

        $html = get_html($url);
        if ($html === false) {
            // Fetch failed (file_get_contents has already raised a warning);
            // there is nothing to parse and no link to follow.
            return;
        }

        get_list($html);

        // Only the first match is needed. No match means the page offers no
        // previous-page link, so the crawl is finished.
        if (preg_match($pattern, $html, $matches) !== 1) {
            return;
        }
        $url = URL_BASE . $matches[1];

        // Pause half a second between requests. sleep() only takes whole
        // seconds (0.5 would be truncated to 0), hence usleep().
        usleep(500000);
    }
}
/**
 * Extract title links from an index page and append those whose title
 * passes filter_key_word() to the global $result list.
 *
 * Each appended entry has the keys 'title' (the link text) and 'url'
 * (URL_BASE followed by the captured href).
 *
 * @param string $content Raw HTML of an index page.
 * @return void
 */
function get_list($content) {
    global $result;

    $regex = '/<div class="title">\s*<a href="(.+)">(.+)<\/a>\s*<\/div>/i';

    // Bail out early when the regex engine reports an error.
    if (preg_match_all($regex, $content, $found) === FALSE) {
        return;
    }
    // Both capture groups (href, title) must be present.
    if (!isset($found[1], $found[2])) {
        return;
    }

    foreach ($found[1] as $idx => $href) {
        $title = $found[2][$idx];
        if (!filter_key_word($title)) {
            continue;
        }
        array_push($result, array(
            'title' => $title,
            'url' => URL_BASE . $href
        ));
    }
}
/**
 * Tell whether a title contains the configured keyword (KEY_WROD).
 *
 * @param string $content Text to search.
 * @return bool TRUE when the keyword occurs anywhere in $content, FALSE otherwise.
 */
function filter_key_word($content) {
    // strpos() yields FALSE only when the keyword is absent; position 0 is a hit.
    return strpos($content, KEY_WROD) !== FALSE;
}
/**
 * Placeholder with no implementation yet: it takes no arguments, does
 * nothing, and is not called by any code visible in this file.
 * NOTE(review): presumably intended to fetch an entry's article body — confirm.
 */
function get_content() {
// TODO: not implemented yet.
}
//============ start =============
// Entry point: crawl from the board index page (entries accumulate in the
// global $result), then dump everything that was collected.
parse(URL_INDEX);
var_dump($result);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment