Created
October 23, 2013 10:54
-
-
Save luxixing/7116529 to your computer and use it in GitHub Desktop.
php-curl-multi
使用 PHP curl_multi 并发请求 URL、抓取页面的类
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Rolling-window concurrent URL fetcher built on PHP's curl_multi API.
 *
 * @author [email protected]
 */
class CurlRoll
{
    /**
     * @var int
     * Maximum number of concurrent transfers. Setting this too high makes the
     * traffic against a single remote host look like a DDoS attack.
     */
    private $window_size = 5;
    /**
     * @var float
     * Timeout, in seconds, passed to curl_multi_select().
     */
    private $timeout = 10;
    /**
     * @var array
     * Queue of request objects (stdClass instances built by createRequest()).
     */
    private $requests = array();
    /**
     * @var array
     * Map of in-flight transfers: handle key => index into $requests.
     */
    private $requestMap = array();
    /**
     * @var string|array
     * Result callback, invoked as callback($output, $info, $request).
     */
    private $callback;
    /**
     * @var array
     * Default cURL options applied to every request.
     */
    private $options = array(
        // NOTE(review): 0 disables TLS peer certificate verification. Kept for
        // backward compatibility; override with
        // $curl->options = array(CURLOPT_SSL_VERIFYPEER => 1) for untrusted networks.
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1, // return the response body instead of printing it
        CURLOPT_CONNECTTIMEOUT => 10, // connection timeout, seconds
        CURLOPT_TIMEOUT => 20, // maximum total transfer time, seconds
        CURLOPT_FOLLOWLOCATION => 1, // follow "Location:" redirects
        CURLOPT_MAXREDIRS => 5, // maximum number of redirects to follow
        CURLOPT_HEADER => 0, // when true the response headers are included in the output
        CURLOPT_AUTOREFERER => true, // set the Referer header automatically on redirects
        CURLOPT_ENCODING => "", // empty string: advertise every supported Accept-Encoding
    );
    /**
     * @var array
     * Default HTTP request header lines ("Name: value" strings, not key => value).
     */
    private $headers = array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'Connection: close',
        'Cache-Control: max-age=0',
    );
    /**
     * @var array
     * Pool of User-Agent strings; one is picked at random for requests that
     * do not specify CURLOPT_USERAGENT themselves.
     */
    private static $agent = array(
        // Google Chrome
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        // Firefox
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0',
        // Internet Explorer
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)',
    );

    /**
     * @param int $window_size Maximum number of concurrent transfers;
     *                         non-positive or non-numeric values fall back to 5.
     */
    public function __construct($window_size = 5)
    {
        $window_size = (int)$window_size;
        $this->window_size = $window_size > 0 ? $window_size : 5;
    }

    /**
     * Drops the references held by this instance.
     *
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests, $this->requestMap);
    }

    /**
     * Read access to the private configuration (options, headers, requests, ...).
     *
     * @param string $name
     * @return mixed The property value, or null when it is unset or null.
     */
    public function __get($name)
    {
        return isset($this->{$name}) ? $this->{$name} : null;
    }

    /**
     * Write access to the private configuration.
     *
     * "options" and "headers" are merged into the existing defaults instead of
     * replacing them: options are merged by cURL option constant, headers by
     * header name (case-insensitive), and the new value wins on conflict.
     * Any other property is assigned as-is.
     *
     * @param string $name
     * @param mixed $value
     * @return bool
     */
    public function __set($name, $value)
    {
        if ($name === 'options')
        {
            $this->options = (array)$value + $this->options;
        } elseif ($name === 'headers')
        {
            // Header lines are a plain list, so they cannot be merged with "+"
            // (that would overwrite by numeric index, not by header name).
            $this->headers = $this->merge_headers($this->headers, $value);
        } else
        {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Queue a GET request that uses the instance default headers and options.
     *
     * The defaults are resolved when the request is executed, so changing
     * "options" / "headers" after add() but before execute() still applies.
     *
     * @param string $url
     * @return bool
     */
    public function add($url)
    {
        $this->requests[] = $this->createRequest($url, 'GET', array(), array());
        return true;
    }

    /**
     * Queue a GET request.
     *
     * @param string $url
     * @param array $headers List of raw header lines (NOT a key => value map), e.g.
     *                       array("Accept: text/xml", "Cache-Control: no-cache").
     *                       A line overrides the default header of the same name.
     * @param array $options cURL options for this request; they override the defaults.
     * @return bool
     */
    public function get($url, $headers = array(), $options = array())
    {
        $this->requests[] = $this->createRequest($url, "GET", $headers, $options);
        return true;
    }

    /**
     * Queue a POST request.
     *
     * @param string $url
     * @param array $headers List of raw header lines, see get().
     * @param array $options cURL options for this request; they override the defaults.
     * @param array|string $post_data Request body (array of fields or raw string).
     *                                Defaults to an empty body so the optional
     *                                parameters before it remain optional.
     * @return bool
     */
    public function post($url, $headers = array(), $options = array(), $post_data = array())
    {
        $this->requests[] = $this->createRequest($url, "POST", $headers, $options, $post_data);
        return true;
    }

    /**
     * Run every queued request and clear the queue.
     *
     * @param mixed $callback Optional callable($output, $info, $request); when
     *                        given it replaces the stored callback.
     * @return mixed With exactly one queued request: the callback's return value,
     *               or the raw curl_exec() result when no callable callback is set.
     *               Otherwise: true when the batch ran, false when the queue was
     *               empty or the multi interface reported an error.
     */
    public function execute($callback = null)
    {
        if ($callback)
        {
            $this->callback = $callback;
        }
        if (count($this->requests) === 1)
        {
            $ret = $this->single_curl();
        } else
        {
            $ret = $this->rolling_curl();
        }
        // The queue is single-use: reset it after every run.
        $this->requests = $this->requestMap = array();
        return $ret;
    }

    /**
     * Performs a single blocking cURL request.
     *
     * @access private
     * @return mixed Callback result, or the curl_exec() output (false on failure).
     */
    private function single_curl()
    {
        $request = array_shift($this->requests);
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($request));
        $output = curl_exec($ch);
        $info = $this->transfer_info($ch, curl_errno($ch));
        // No explicit curl_close(): the handle is released when $ch goes out of
        // scope, and curl_close() is a no-op on PHP >= 8.0.
        if ($this->callback && is_callable($this->callback))
        {
            return call_user_func($this->callback, $output, $info, $request);
        }
        return $output;
    }

    /**
     * Performs the queued requests concurrently, keeping at most
     * $window_size transfers in flight.
     *
     * @access private
     * @return bool False when the queue is empty or curl_multi_exec() failed.
     */
    private function rolling_curl()
    {
        // Re-index so that positions 0..n-1 are guaranteed to exist.
        $this->requests = array_values((array)$this->requests);
        $total = count($this->requests);
        if ($total === 0)
        {
            return false;
        }
        // Use a local window so the configured size is not overwritten by a small batch.
        $window = max(1, min((int)$this->window_size, $total));
        $callback = is_callable($this->callback) ? $this->callback : null;
        $this->requestMap = array();
        $master = curl_multi_init();
        $ok = true;
        // Start the first batch of requests.
        for ($next = 0; $next < $window; $next++)
        {
            $ok = $this->start_request($master, $next) && $ok;
        }
        do
        {
            do
            {
                $status = curl_multi_exec($master, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);
            if ($status !== CURLM_OK)
            {
                $ok = false;
                break;
            }
            // One or more transfers may have completed -- find out which.
            while ($done = curl_multi_info_read($master))
            {
                $handle = $done['handle'];
                $key = $this->handle_key($handle);
                $index = isset($this->requestMap[$key]) ? $this->requestMap[$key] : null;
                unset($this->requestMap[$key]);
                if ($callback !== null && $index !== null)
                {
                    $info = $this->transfer_info($handle, $done['result']);
                    $output = curl_multi_getcontent($handle);
                    call_user_func($callback, $output, $info, $this->requests[$index]);
                }
                // Start the next request before removing the finished one. The queue
                // is re-checked each time because a callback may have added requests.
                if (isset($this->requests[$next]))
                {
                    $ok = $this->start_request($master, $next) && $ok;
                    $next++;
                }
                curl_multi_remove_handle($master, $handle);
            }
            // $running was sampled before the handles above were added, so it can be
            // 0 while new transfers are waiting; keep looping while any are registered.
            $pending = count($this->requestMap) > 0;
            if ($running && curl_multi_select($master, $this->timeout) === -1)
            {
                // select() can fail immediately on some systems; avoid a busy loop.
                usleep(100);
            }
        } while ($running || $pending);
        curl_multi_close($master);
        return $ok;
    }

    /**
     * Creates an easy handle for $this->requests[$index], attaches it to the
     * multi handle and registers it in $requestMap.
     *
     * @access private
     * @param resource|object $master curl_multi handle
     * @param int $index
     * @return bool False when the handle could not be attached.
     */
    private function start_request($master, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        if (curl_multi_add_handle($master, $ch) !== CURLM_OK)
        {
            // Not registered: an unattached handle would never complete.
            return false;
        }
        $this->requestMap[$this->handle_key($ch)] = $index;
        return true;
    }

    /**
     * Returns a string key identifying a cURL easy handle.
     *
     * Handles are resources before PHP 8.0 and CurlHandle objects afterwards;
     * objects cannot be cast to string, so they are keyed by object hash.
     *
     * @access private
     * @param resource|object $handle
     * @return string
     */
    private function handle_key($handle)
    {
        return is_object($handle) ? spl_object_hash($handle) : (string)$handle;
    }

    /**
     * Collects curl_getinfo() data plus the transfer error, if any.
     *
     * @access private
     * @param resource|object $handle
     * @param int $errno CURLE_* result code of the transfer (0 on success)
     * @return array curl_getinfo() fields plus "curl_errno" and "curl_error".
     */
    private function transfer_info($handle, $errno)
    {
        $info = curl_getinfo($handle);
        if (!is_array($info))
        {
            $info = array();
        }
        $info['curl_errno'] = (int)$errno;
        $info['curl_error'] = $errno ? curl_error($handle) : '';
        return $info;
    }

    /**
     * Merges two lists of raw header lines by header name (case-insensitive).
     * Lines from $extra replace lines from $base that carry the same name.
     *
     * @access private
     * @param array $base
     * @param array $extra
     * @return array Re-indexed list of header lines.
     */
    private function merge_headers($base, $extra)
    {
        $merged = array();
        foreach (array_merge(array_values((array)$base), array_values((array)$extra)) as $line)
        {
            $line = (string)$line;
            $colon = strpos($line, ':');
            // Lines without a colon are keyed by their full text.
            $name = $colon === false ? $line : substr($line, 0, $colon);
            $merged[strtolower(trim($name))] = $line;
        }
        return array_values($merged);
    }

    /**
     * Builds the final cURL option array for one request.
     *
     * Precedence, highest first: request URL / body, per-request options,
     * instance default options, random User-Agent fallback.
     *
     * @access private
     * @param stdClass $request Object produced by createRequest()
     * @return array
     */
    private function get_options($request)
    {
        // With "+", the left operand wins: request-specific options override defaults.
        $options = (array)$request->options + $this->options;
        if (!isset($options[CURLOPT_USERAGENT]))
        {
            $options[CURLOPT_USERAGENT] = self::$agent[array_rand(self::$agent)];
            // Record the chosen agent so the callback can see what was sent.
            $request->options = (array)$request->options;
            $request->options[CURLOPT_USERAGENT] = $options[CURLOPT_USERAGENT];
        }
        $options[CURLOPT_URL] = $request->url;
        // cURL defaults to GET; a request is a POST when declared as such or when
        // it carries a body.
        $is_post = isset($request->method) && $request->method === 'POST';
        if ($is_post || $request->post_data)
        {
            $options[CURLOPT_POST] = true;
            // An explicit empty string sends "Content-Length: 0" for body-less POSTs.
            $options[CURLOPT_POSTFIELDS] = $request->post_data ? $request->post_data : '';
        }
        $options[CURLOPT_HTTPHEADER] = $this->merge_headers($this->headers, $request->headers);
        return $options;
    }

    /**
     * Builds a request descriptor.
     *
     * @access private
     * @param string $url
     * @param string $method "GET" or "POST"
     * @param array $headers List of raw header lines
     * @param array $options Per-request cURL options
     * @param array|string $data POST body
     * @return stdClass With url, method, headers, options and post_data properties.
     */
    private function createRequest($url, $method, $headers, $options, $data = array())
    {
        $o = new stdClass();
        $o->url = $url;
        $o->method = $method;
        $o->headers = (array)$headers;
        $o->options = (array)$options;
        $o->post_data = $data;
        // The random User-Agent fallback is resolved in get_options() so that it
        // never overrides an agent configured on the instance.
        return $o;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment