Created
November 16, 2017 07:12
-
-
Save imdong/e333581c3ce2b25940ee7e67f6e947ee to your computer and use it in GitHub Desktop.
cURL网页抓取
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * 名称:cURL网页抓取
 * 版本:v0.3
 * 作者:吣碎De人(http://www.qs5.org)
 * 最后更新时间:2013年2月4日
 * 获取更新:http://www.qs5.org/
 *
 */
// Usage:
/*
$_Url = "http://www.baidu.com";
$_Data = "u=admin&p=123456";
$_Cookies = "0a63b_lastvisit=176%091359981539%09%2Flogin.php; 0a63b_winduser=BlEOUFpoCgUAAgAHWlVSDQZUCgMOUQcABwgAClFXUQFfCABTVlow; 0a63b_ck_info=%2F%09; 0a63b_lastvisit=deleted";
// Note: the proxy user name key is spelled "UserNmae" (sic) because that is the key the function reads.
$Proxy = array("Proxy" => "124.160.133.2:80", "UserNmae" => "Root", "PassWord" => "Root");
$Head = array("User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", "Accept-Language: en-us");
// Arguments, in order: URL, method, POST data, cookies, extra headers, proxy settings, connect timeout (seconds)
$_Str = Get_Web_Contents($_Url, "GET", $_Data, $_Cookies, $Head, $Proxy, 30);
print_r($_Str);
*/
/**
 * Fetch a URL with cURL and return the response split into header, cookies and body.
 *
 * @param string $_Get_Url   URL to request.
 * @param string $_Method    "POST" (case-insensitive) sends $_Form_Data as the request body;
 *                           any other value leaves cURL's default request method in place.
 * @param mixed  $_Form_Data Value handed to CURLOPT_POSTFIELDS for POST requests.
 * @param string $_Cookie    Value for the request Cookie header; skipped when empty.
 * @param array  $_Headers   Extra request headers. Accepts a list of "Name: value" lines and/or
 *                           name => value pairs; a "User-Agent" key is applied through
 *                           CURLOPT_USERAGENT instead of being sent as a raw header line.
 * @param array  $_Proxy     "Proxy" => "host:port", plus optional "UserNmae" (historic spelling,
 *                           "UserName" is accepted too) and "PassWord" for proxy authentication.
 * @param int    $_Time_Out  Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT). This limits the
 *                           connection phase only, not the whole transfer.
 *
 * @return array On failure: array("Error" => cURL error message).
 *               On success: "Info" (curl_getinfo() result), "Header" (final response header
 *               block without the trailing blank line), "Cookies" ("name=value; name=value"
 *               built from Set-Cookie lines) and "Body".
 */
function Get_Web_Contents($_Get_Url, $_Method = "GET", $_Form_Data = "", $_Cookie = "", $_Headers = array(), $_Proxy = array("Proxy" => ""), $_Time_Out = 30){
    $ret = array();

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $_Get_Url);
    curl_setopt($ch, CURLOPT_HEADER, 1);          // include response headers in the returned data
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $_Time_Out);
    // NOTE(review): TLS peer and host verification are disabled, as in the original
    // implementation. This permits man-in-the-middle attacks on HTTPS requests; kept only
    // so existing callers that hit self-signed hosts keep working.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    // Proxy settings. The option is set whenever the "Proxy" key exists, even when it is an
    // empty string, exactly as before.
    if (is_array($_Proxy) && isset($_Proxy["Proxy"])) {
        curl_setopt($ch, CURLOPT_PROXY, $_Proxy["Proxy"]);

        $proxyUser = null;
        if (isset($_Proxy["UserNmae"])) {
            $proxyUser = $_Proxy["UserNmae"];     // misspelled key kept for existing callers
        } elseif (isset($_Proxy["UserName"])) {
            $proxyUser = $_Proxy["UserName"];
        }
        if ($proxyUser !== null && isset($_Proxy["PassWord"])) {
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxyUser . ":" . $_Proxy["PassWord"]);
        }
    }

    // POST body.
    if (strtoupper((string) $_Method) === "POST") {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $_Form_Data);
    }

    // Request cookies.
    if (strlen((string) $_Cookie) > 0) {
        curl_setopt($ch, CURLOPT_COOKIE, $_Cookie);
    }

    // Extra request headers. CURLOPT_HTTPHEADER only understands "Name: value" lines, so
    // associative entries are converted rather than sent as a bare value.
    if (is_array($_Headers)) {
        $headerLines = array();
        foreach ($_Headers as $name => $value) {
            if (!is_string($name)) {
                $headerLines[] = $value;
            } elseif (strcasecmp($name, "User-Agent") === 0) {
                curl_setopt($ch, CURLOPT_USERAGENT, $value);
            } else {
                $headerLines[] = $name . ": " . $value;
            }
        }
        if (count($headerLines) > 0) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headerLines);
        }
    }

    $html = curl_exec($ch);
    if ($html === false) {
        $ret["Error"] = curl_error($ch);
        return $ret;
    }
    $ret["Info"] = curl_getinfo($ch);

    // Separate headers from body. The header size reported by cURL covers every header block
    // it received (interim responses included), so the body is never cut at the wrong blank
    // line. If no size is reported, fall back to the first blank line; if there is none,
    // the whole response is treated as body.
    $headerSize = isset($ret["Info"]["header_size"]) ? (int) $ret["Info"]["header_size"] : 0;
    if ($headerSize <= 0) {
        $separatorPos = strpos($html, "\r\n\r\n");
        $headerSize = ($separatorPos === false) ? 0 : $separatorPos + 4;
    }
    $rawHeaders = (string) substr($html, 0, $headerSize);
    $headerBlocks = explode("\r\n\r\n", rtrim($rawHeaders, "\r\n"));
    $ret["Header"] = end($headerBlocks);          // the last block belongs to the final response

    // Collect "name=value" from each Set-Cookie line; attributes after the first ";" are
    // dropped, and a cookie without any attributes is still picked up.
    $cookiePairs = array();
    if (preg_match_all('/^set-cookie:[ \t]*([^=;\r\n]+=[^;\r\n]*)/im', $ret["Header"], $matches) > 0) {
        $cookiePairs = array_map('trim', $matches[1]);
    }
    $ret["Cookies"] = implode("; ", $cookiePairs);

    $ret["Body"] = (string) substr($html, $headerSize);
    return $ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment