cloudsben · December 15, 2015 05:39
diff --git a/http_get_utf8_html.php b/http_get_utf8_html.php
 <?php
 if ( ! function_exists('http_get_utf8_html'))
 {
 	/**
 	 * Fetch a page with cURL and return its markup, converted to UTF-8 when the
 	 * source charset is GBK/GB2312, with its links rewritten to absolute URLs.
 	 *
 	 * Steps: probe the URL via get_headers_by_curl(), reject non-text/non-XML
 	 * media types, GET the body, determine the charset (explicit argument,
 	 * then Content-Type header, then XML declaration / <meta> tag), convert,
 	 * and finally pass the body through make_all_url_absolute().
 	 *
 	 * Relies on get_headers_by_curl(), make_all_url_absolute() and the
 	 * USER_AGENT constant, all of which are defined outside this block.
 	 *
 	 * @param string $url           Address of the page to download.
 	 * @param string $from_encoding Optional source charset; when non-empty it
 	 *                              overrides whatever charset is detected.
 	 * @return array 'url' and 'html' are always present; 'base_url' is added
 	 *               when a non-empty body was received; 'error' is added when
 	 *               the media type is rejected or the GET status is not 200.
 	 */
 	function http_get_utf8_html($url,$from_encoding='')
 	{
 		$result = array();
 		$result['url'] = $url;
 		$result['html'] = '';

 		// NOTE(review): get_headers_by_curl() is not visible here; it is used as
 		// if it returned an array with 'url' and 'content_type' entries
 		// (presumably the effective URL after redirects - confirm).
 		$header = get_headers_by_curl($url);

 		// Fall back to the caller's URL when the probe reports none.
 		if (!empty($header['url']))
 		{
 			$url = $header['url'];
 		}

 		// The probe's status code is deliberately NOT checked: many servers
 		// answer a header-only (HEAD) request with 404 although GET succeeds.

 		$content_type = isset($header['content_type']) ? trim((string) $header['content_type']) : '';
 		$ctype = '';
 		$encoding = null;

 		if ($content_type !== '' && preg_match('/^\s*([^;]+)/', $content_type, $match))
 		{
 			$ctype = $match[1];
 			// The charset parameter may be quoted and may be followed by other parameters.
 			if (preg_match('/;\s*charset\s*=\s*([^;]*)/i', $content_type, $match))
 			{
 				$encoding = strtolower(trim($match[1], " \t\r\n\"'"));
 			}
 		}

 		// Only textual / XML documents are handled; media types are
 		// case-insensitive, hence stripos(). A missing type is rejected too.
 		if (stripos($ctype,'text')===false && stripos($ctype,'xml')===false)
 		{
 			$result['error'] = 'content-type:'.$ctype;
 			return $result;
 		}

 		$request_headers = array(
 			'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 			'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3',
 		);

 		$ch = curl_init($url);
 		// NOTE(review): certificate verification is switched off, so HTTPS
 		// responses are not authenticated. Kept as in the original.
 		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
 		curl_setopt($ch, CURLOPT_VERBOSE, false);
 		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);  // do not follow redirects on the GET
 		curl_setopt($ch, CURLOPT_HEADER, false);
 		curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
 		curl_setopt($ch, CURLOPT_TIMEOUT, 15);
 		curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers);
 		$body = curl_exec($ch);
 		$info = curl_getinfo($ch);
 		curl_close($ch);

 		// A failed transfer reports http_code 0, so this also covers cURL errors.
 		$http_code = isset($info['http_code']) ? (int) $info['http_code'] : 0;
 		if ($http_code !== 200)
 		{
 			$result['error'] = 'http_code:'.$http_code;
 			return $result;
 		}

 		if ($body)
 		{
 			// Normalise the caller's charset so that 'GB2312' / ' gbk ' are recognised.
 			$from_encoding = strtolower(trim((string) $from_encoding));

 			if ($from_encoding !== '')
 			{
 				$encoding = $from_encoding;
 			}
 			elseif (!$encoding)
 			{
 				// No charset in the HTTP header: sniff the document itself.
 				if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $body, $match))
 				{
 					$encoding = strtolower(trim($match[1], '"\''));
 				}
 				// Matches both <meta charset="..."> and
 				// <meta http-equiv="Content-Type" content="...; charset=...">.
 				// Only a tag that really declares a charset may override the
 				// XML declaration; other <meta content="..."> tags are ignored.
 				if (preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:\-]+)/i', $body, $match))
 				{
 					$encoding = strtolower($match[1]);
 				}
 			}

 			// GB2312 is decoded as GBK (a superset of GB2312).
 			if ($encoding === 'gb2312')
 			{
 				$encoding = 'gbk';
 			}

 			// Convert the page encoding. Only GBK-family pages are converted;
 			// every other charset is passed through unchanged.
 			if ($encoding === 'gbk')
 			{
 				$body = mb_convert_encoding($body, 'utf-8', $encoding);
 			}

 			$result['url'] = $url;

 			// Directory part of the URL, always ending in "/". The query string
 			// and fragment are removed first so that slashes inside them are not
 			// mistaken for path separators.
 			$path_url = preg_replace('/[?#].*$/s', '', $url);
 			if (preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*$#i', $path_url))
 			{
 				// Bare origin such as "http://example.com": the slashes of "://"
 				// must not be treated as the start of a path segment.
 				$base_url = $path_url;
 			}
 			else
 			{
 				$base_url = preg_replace('#/[^/]*$#', '', $path_url);
 			}
 			if (substr($base_url, -1, 1) != '/')
 			{
 				$base_url .= '/';
 			}

 			// Rewrite the page's link sources to absolute addresses.
 			$body = make_all_url_absolute($body, $base_url);

 			$result['html'] = $body;
 			$result['base_url'] = $base_url;
 		}

 		return $result;
 	}
 }
	<?php
	if ( ! function_exists('http_get_utf8_html'))
	{
		/**
		 * Fetch a page with cURL and return its markup, converted to UTF-8 when the
		 * source charset is GBK/GB2312, with its links rewritten to absolute URLs.
		 *
		 * Steps: probe the URL via get_headers_by_curl(), reject non-text/non-XML
		 * media types, GET the body, determine the charset (explicit argument,
		 * then Content-Type header, then XML declaration / <meta> tag), convert,
		 * and finally pass the body through make_all_url_absolute().
		 *
		 * Relies on get_headers_by_curl(), make_all_url_absolute() and the
		 * USER_AGENT constant, all of which are defined outside this block.
		 *
		 * @param string $url           Address of the page to download.
		 * @param string $from_encoding Optional source charset; when non-empty it
		 *                              overrides whatever charset is detected.
		 * @return array 'url' and 'html' are always present; 'base_url' is added
		 *               when a non-empty body was received; 'error' is added when
		 *               the media type is rejected or the GET status is not 200.
		 */
		function http_get_utf8_html($url,$from_encoding='')
		{
			$result = array();
			$result['url'] = $url;
			$result['html'] = '';

			// NOTE(review): get_headers_by_curl() is not visible here; it is used as
			// if it returned an array with 'url' and 'content_type' entries
			// (presumably the effective URL after redirects - confirm).
			$header = get_headers_by_curl($url);

			// Fall back to the caller's URL when the probe reports none.
			if (!empty($header['url']))
			{
				$url = $header['url'];
			}

			// The probe's status code is deliberately NOT checked: many servers
			// answer a header-only (HEAD) request with 404 although GET succeeds.

			$content_type = isset($header['content_type']) ? trim((string) $header['content_type']) : '';
			$ctype = '';
			$encoding = null;

			// The quantifiers matter: the header is trimmed, so a pattern that
			// demands a leading whitespace character would never match.
			if ($content_type !== '' && preg_match('/^\s*([^;]+)/', $content_type, $match))
			{
				$ctype = $match[1];
				// The charset parameter may be quoted and may be followed by other parameters.
				if (preg_match('/;\s*charset\s*=\s*([^;]*)/i', $content_type, $match))
				{
					$encoding = strtolower(trim($match[1], " \t\r\n\"'"));
				}
			}

			// Only textual / XML documents are handled; media types are
			// case-insensitive, hence stripos(). A missing type is rejected too.
			if (stripos($ctype,'text')===false && stripos($ctype,'xml')===false)
			{
				$result['error'] = 'content-type:'.$ctype;
				return $result;
			}

			$request_headers = array(
				'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
				'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3',
			);

			$ch = curl_init($url);
			// NOTE(review): certificate verification is switched off, so HTTPS
			// responses are not authenticated. Kept as in the original.
			curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
			curl_setopt($ch, CURLOPT_VERBOSE, false);
			curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
			curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); // do not follow redirects on the GET
			curl_setopt($ch, CURLOPT_HEADER, false);
			curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
			curl_setopt($ch, CURLOPT_TIMEOUT, 15);
			curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers);
			$body = curl_exec($ch);
			$info = curl_getinfo($ch);
			curl_close($ch);

			// A failed transfer reports http_code 0, so this also covers cURL errors.
			$http_code = isset($info['http_code']) ? (int) $info['http_code'] : 0;
			if ($http_code !== 200)
			{
				$result['error'] = 'http_code:'.$http_code;
				return $result;
			}

			if ($body)
			{
				// Normalise the caller's charset so that 'GB2312' / ' gbk ' are recognised.
				$from_encoding = strtolower(trim((string) $from_encoding));

				if ($from_encoding !== '')
				{
					$encoding = $from_encoding;
				}
				elseif (!$encoding)
				{
					// No charset in the HTTP header: sniff the document itself.
					if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $body, $match))
					{
						$encoding = strtolower(trim($match[1], '"\''));
					}
					// Matches both <meta charset="..."> and
					// <meta http-equiv="Content-Type" content="...; charset=...">.
					// Only a tag that really declares a charset may override the
					// XML declaration; other <meta content="..."> tags are ignored.
					if (preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:\-]+)/i', $body, $match))
					{
						$encoding = strtolower($match[1]);
					}
				}

				// GB2312 is decoded as GBK (a superset of GB2312).
				if ($encoding === 'gb2312')
				{
					$encoding = 'gbk';
				}

				// Convert the page encoding. Only GBK-family pages are converted;
				// every other charset is passed through unchanged.
				if ($encoding === 'gbk')
				{
					$body = mb_convert_encoding($body, 'utf-8', $encoding);
				}

				$result['url'] = $url;

				// Directory part of the URL, always ending in "/". The query string
				// and fragment are removed first so that slashes inside them are not
				// mistaken for path separators.
				$path_url = preg_replace('/[?#].*$/s', '', $url);
				if (preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*$#i', $path_url))
				{
					// Bare origin such as "http://example.com": the slashes of "://"
					// must not be treated as the start of a path segment.
					$base_url = $path_url;
				}
				else
				{
					$base_url = preg_replace('#/[^/]*$#', '', $path_url);
				}
				if (substr($base_url, -1, 1) != '/')
				{
					$base_url .= '/';
				}

				// Rewrite the page's link sources to absolute addresses.
				$body = make_all_url_absolute($body, $base_url);

				$result['html'] = $body;
				$result['base_url'] = $base_url;
			}

			return $result;
		}
	}