Skip to content

Instantly share code, notes, and snippets.

@cloudsben
Last active December 15, 2015 05:39
Show Gist options
  • Save cloudsben/5210427 to your computer and use it in GitHub Desktop.
Save cloudsben/5210427 to your computer and use it in GitHub Desktop.
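/* Fetch the HTML of the given URL. If the page is not UTF-8 encoded, it is automatically converted to UTF-8. If the content-type of the URL is not text/html, the page content is not fetched. Returns an array with the keys: url (the post-redirect URL when a 301/302 occurred), html, base_url, error. */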
<?php
if ( ! function_exists('http_get_utf8_html'))
{
/**
 * Download a text/XML resource and return its markup as UTF-8.
 *
 * Steps:
 *  1. get_headers_by_curl() is asked for the effective URL and Content-Type.
 *  2. Resources whose media type contains neither "text" nor "xml" are
 *     rejected without downloading the body.
 *  3. The body is fetched with cURL (redirects are not followed here).
 *  4. The source charset is taken from, in order of precedence:
 *     $from_encoding, the Content-Type header, a <meta> charset declaration,
 *     the XML declaration.
 *  5. GB2312/GBK pages are transcoded to UTF-8; any other charset is returned
 *     unchanged.
 *  6. The body is passed through make_all_url_absolute() together with the
 *     directory part of the URL.
 *
 * @param string $url           Address to fetch.
 * @param string $from_encoding Optional source charset; when non-empty it
 *                              overrides every detected value
 *                              (case-insensitive).
 *
 * @return array Associative array with the keys:
 *               - 'url':      the URL reported by get_headers_by_curl() on
 *                             success, otherwise the URL that was passed in;
 *               - 'html':     processed body, '' when nothing was retrieved;
 *               - 'base_url': directory URL ending in '/', present only when
 *                             a body was retrieved;
 *               - 'error':    present only on failure ('content-type:...',
 *                             'http_code:...' or 'curl_init failed').
 */
function http_get_utf8_html($url,$from_encoding='')
{
    $result = array();
    $result['url'] = $url;
    $result['html'] = '';

    $header = get_headers_by_curl($url);
    // Keep the caller's URL when the header probe did not report one.
    if (isset($header['url']) && $header['url']) {
        $url = $header['url'];
    }

    // The status code of the header probe is deliberately NOT checked: per
    // the original author's note, many pages answer 404 when only the HTTP
    // headers are requested with HEAD even though a normal GET succeeds.

    $content_type = isset($header['content_type']) ? trim($header['content_type']) : '';
    $ctype = '';      // media type without parameters, e.g. "text/html"
    $encoding = null; // lower-cased charset name once known
    if ($content_type !== '') {
        $parts = explode(';', $content_type, 2);
        $ctype = trim($parts[0]);
        // charset may be any parameter, not only the first one, and may be quoted.
        if (preg_match('/charset\s*=\s*["\']?([^;"\'\s]+)/i', $content_type, $match)) {
            $encoding = strtolower($match[1]);
        }
    }

    // Only textual / XML payloads are downloaded. An empty or missing
    // Content-Type is rejected as well.
    if (strpos($ctype, 'text') === false && strpos($ctype, 'xml') === false) {
        $result['error'] = 'content-type:'.$ctype;
        return $result;
    }

    $request_headers = array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    );

    $ch = curl_init($url);
    if ($ch === false) {
        $result['error'] = 'curl_init failed';
        return $result;
    }
    // NOTE(review): certificate verification is disabled, so HTTPS responses
    // are not authenticated. Left as in the original; confirm this is intended.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_VERBOSE, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Do not follow redirects for the body request.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers);
    $body = curl_exec($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    // A transport failure leaves http_code at 0 and is reported the same way.
    $http_code = isset($info['http_code']) ? (int) $info['http_code'] : 0;
    if ($http_code !== 200) {
        $result['error'] = 'http_code:'.$http_code;
        return $result;
    }

    if ($body) {
        if (!$encoding && !$from_encoding) {
            // XML declaration: <?xml version="1.0" encoding="...">
            if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $body, $match)) {
                $encoding = strtolower(trim($match[1], '"\''));
            }
            // <meta charset="..."> or <meta http-equiv=... content="...; charset=...">.
            // Assigned only when a charset is really present, so an unrelated
            // <meta content="..."> tag cannot wipe out the XML-declared value.
            if (preg_match('/<meta[^>]+charset\s*=\s*["\']?\s*([a-z0-9_:.\-]+)/i', $body, $match)) {
                $encoding = strtolower($match[1]);
            }
        }
        if ($from_encoding) {
            // Normalise case so that e.g. 'GBK' is recognised below.
            $encoding = strtolower(trim($from_encoding));
        }
        // GB2312 is decoded as its superset GBK.
        if ($encoding === 'gb2312') {
            $encoding = 'gbk';
        }
        // Transcode the page to UTF-8 (GBK family only).
        if ($encoding === 'gbk') {
            $body = mb_convert_encoding($body, 'utf-8', $encoding);
        }

        $result['url'] = $url;

        // Directory part of the URL. Query string and fragment are removed
        // first so that a '/' inside them cannot be mistaken for a path
        // separator.
        $location = preg_replace('/[?#].*$/s', '', $url);
        if (preg_match('#^[a-z][a-z0-9+.\-]*://[^/]+$#i', $location)) {
            // Bare "scheme://host" with no path: the host itself is the base.
            $base_url = $location.'/';
        } else {
            $base_url = preg_replace('#/[^/]*$#', '', $location);
            if (substr($base_url, -1, 1) != '/') {
                $base_url .= '/';
            }
        }

        // Rewrite links in the page to absolute addresses.
        $body = make_all_url_absolute($body, $base_url);
        $result['html'] = $body;
        $result['base_url'] = $base_url;
    }

    return $result;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment