Skip to content

Instantly share code, notes, and snippets.

@taichunmin
Created March 21, 2014 08:34
Show Gist options
  • Select an option

  • Save taichunmin/9682027 to your computer and use it in GitHub Desktop.

Select an option

Save taichunmin/9682027 to your computer and use it in GitHub Desktop.
A web crawler for npo.org.tw
<?php
// Crawl every organisation listed on npo.org.tw and dump the collected
// fields into list.txt as CSV (one header line, then one line per organisation).
set_time_limit(0);

// Open the output first so an unwritable target fails before any crawling starts.
$fh = fopen('list.txt', 'w');
if (!$fh) {
    exit("write file error!");
}

// The "last page" link on the first listing page carries the total page count
// in its nowpage parameter.
$firstPage = file_get_contents("http://www.npo.org.tw/npolist_list.asp?nowpage=1&npost=&keyword2=");
if ($firstPage === false || !preg_match('/nowpage=(\d+)[^"]*">最末頁/', $firstPage, $match)) {
    // Without a page count nothing can be crawled; stop loudly instead of
    // silently writing an empty file.
    fclose($fh);
    exit("read page count error!");
}
$pageCount = (int) $match[1];

// Visit every listing page and collect one associative row per organisation.
// getDetail() returns a non-array when a detail page is unusable; skip those.
$buffer = array();
for ($i = 1; $i <= $pageCount; $i++) {
    foreach (detailId($i) as $id) {
        $row = getDetail($id);
        if (is_array($row)) {
            $buffer[] = $row;
        }
    }
}

// The header is the union of all row keys in first-seen order. Rows may carry
// different key sets, which is why everything is buffered before writing.
$seen = array();
foreach ($buffer as $row) {
    foreach ($row as $key => $unused) {
        $seen[$key] = true;
    }
}
$head = array();
foreach (array_keys($seen) as $key) {
    // Drop only the empty label; a column literally named "0" is kept.
    if ((string) $key !== '') {
        $head[] = $key;
    }
}
fputcsv($fh, $head);

// Emit each row aligned to the header, with '' for fields a row does not have.
// isset() is used instead of ?: so that a legitimate "0" value is preserved.
foreach ($buffer as $row) {
    $line = array();
    foreach ($head as $key) {
        $line[] = isset($row[$key]) ? $row[$key] : '';
    }
    fputcsv($fh, $line);
}
fclose($fh);
/**
 * Collect the organisation detail IDs linked from one listing page.
 *
 * @param int $pageNo Listing page number, sent as the `nowpage` query parameter.
 * @return array List of numeric ID strings in page order, without duplicates;
 *               empty when the page cannot be fetched or links to no detail page.
 */
function detailId($pageNo)
{
    $listHtml = file_get_contents("http://www.npo.org.tw/npolist_list.asp?nowpage=$pageNo&npost=&keyword2=");
    if ($listHtml === false) {
        // file_get_contents() has already raised a warning; treat the page as empty
        // so the crawl can continue with the next one.
        return array();
    }

    if (!preg_match_all('/npolist_detail\.asp\?id=(\d+)/', $listHtml, $match)) {
        return array();
    }

    // A detail page may be linked more than once from the same listing page;
    // return each ID once so it is not fetched and written twice.
    return array_values(array_unique($match[1]));
}
/**
 * Fetch one organisation detail page and flatten it into a single associative row.
 *
 * The row starts with a 'URL' entry holding the detail page address, followed by
 * the label => value pairs parsed from the page's "contact information" table and
 * its "basic data" table (later keys overwrite earlier ones, per array_merge()).
 *
 * @param int|string $id Detail page ID as found in the listing links.
 * @return array|null The row, or null when the page is empty, cannot be fetched,
 *                    or does not contain both expected tables.
 */
function getDetail($id)
{
    // usleep() counts microseconds, so this pauses for only 100-300 microseconds.
    // NOTE(review): if a 100-300 ms politeness delay was intended, the value
    // needs to be multiplied by 1000 - confirm before changing the crawl timing.
    usleep(rand(100, 300));

    $url = 'http://www.npo.org.tw/npolist_detail.asp?id=' . $id;
    $html = file_get_contents($url);
    if (empty($html)) {
        return null;
    }

    // Capture the first table after each of the two section headings. When the
    // page lacks either one there is nothing to parse, so report "no row"
    // instead of reading undefined match groups.
    if (!preg_match('/聯絡資訊.*?(<table.*?<\/table>).*?基本資料.*?(<table.*?<\/table>)/s', $html, $match)) {
        return null;
    }

    return array_merge(array('URL' => $url), tableToArray($match[1]), tableToArray($match[2]));
}
/**
 * Convert an HTML table of <th>label</th> ... <td>value</td> pairs into an
 * associative array.
 *
 * Before parsing, <br> tags become ';', &nbsp; becomes a space and <iframe>
 * elements are replaced by a space. Labels and values then have their remaining
 * tags stripped, their HTML special characters decoded and surrounding
 * whitespace trimmed. When a label occurs more than once the last value wins.
 *
 * @param string|null $tableHtml HTML fragment containing the table.
 * @return array Map of label => value; empty when no th/td pair is found.
 */
function tableToArray($tableHtml)
{
    $tableHtml = preg_replace(
        array('/<br\b[^>]*>/i', '/&nbsp;/', '/<iframe.*?<\/iframe>/is'),
        array(';', ' ', ' '),
        (string) $tableHtml
    );
    if ($tableHtml === null) {
        // preg_replace() signals an internal PCRE error with null.
        return array();
    }

    // \b keeps <thead>/<tbody>-style tags from being mistaken for <th>/<td>.
    if (!preg_match_all('@<th\b[^>]*>(.*?)</th>.*?<td\b[^>]*>(.*?)</td>@is', $tableHtml, $match)) {
        return array();
    }

    $result = array();
    foreach ($match[1] as $index => $rawLabel) {
        // Strip tags BEFORE decoding entities: decoding first would turn text such
        // as "&lt;b&gt;" into markup that strip_tags() then removes.
        $label = trim(htmlspecialchars_decode(strip_tags($rawLabel)));
        $result[$label] = trim(htmlspecialchars_decode(strip_tags($match[2][$index])));
    }

    return $result;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment