Skip to content

Instantly share code, notes, and snippets.

@luxixing
Created October 23, 2013 10:54
Show Gist options
  • Save luxixing/7116529 to your computer and use it in GitHub Desktop.
Save luxixing/7116529 to your computer and use it in GitHub Desktop.
php-curl-multi: a class that uses PHP's curl_multi API to request URLs concurrently (within a single process) and fetch the pages
<?php
/**
 * Rolling concurrent HTTP client built on PHP's curl_multi API.
 *
 * Requests are queued with add(), get() or post() and sent by execute().
 * When more than one request is queued, at most `window_size` transfers are
 * in flight at once; each time one finishes, the next queued request starts.
 *
 * The optional callback is invoked once per finished request as
 * `callback($output, $info, $request)` where $output is the response body
 * (or false on failure for a single request), $info is the curl_getinfo()
 * array and $request is the queued request object.
 *
 * @author [email protected]
 */
class CurlRoll
{
    /**
     * @var int
     * Maximum number of concurrent requests. Setting this too high while
     * hitting a single remote host can easily be mistaken for a DDoS attack.
     */
    private $window_size = 5;

    /**
     * @var float
     * Timeout, in seconds, passed to curl_multi_select().
     */
    private $timeout = 10;

    /**
     * @var array
     * Queue of request objects built by createRequest().
     */
    private $requests = array();

    /**
     * @var array
     * Map of in-flight curl handle key => index into $requests.
     */
    private $requestMap = array();

    /**
     * @var string|array
     * Callback used to process each result.
     */
    private $callback;

    /**
     * @var array
     * Default curl options applied to every request.
     */
    private $options = array(
        // NOTE(review): this disables TLS peer-certificate verification (it does
        // NOT disable HTTPS). Kept for backward compatibility; override it per
        // request or via `$roll->options = array(CURLOPT_SSL_VERIFYPEER => 1)`.
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1, // return the response as a string instead of printing it
        CURLOPT_CONNECTTIMEOUT => 10, // connection timeout, in seconds
        CURLOPT_TIMEOUT => 20, // maximum total execution time of a transfer, in seconds
        CURLOPT_FOLLOWLOCATION => 1, // follow redirects given by the Location response header
        CURLOPT_MAXREDIRS => 5, // maximum number of redirects when CURLOPT_FOLLOWLOCATION is on
        CURLOPT_HEADER => 0, // when true, the response headers are included in the output
        CURLOPT_AUTOREFERER => true, // set the Referer header automatically when following a redirect
        CURLOPT_ENCODING => "", // "Accept-Encoding" value; empty string sends every supported encoding
    );

    /**
     * @var array
     * Default HTTP request header lines ("Name: value" strings, not a map).
     */
    private $headers = array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'Connection: close',
        'Cache-Control: max-age=0',
        //'X-FORWARD-FOR:8.8.8.8', // proxy IP address
        //'CLIENT-IP:3.3.3.3', // client IP; when REMOTE_ADDR is set that is the more reliable IP and is hard to forge
    );

    /**
     * @var array
     * Pool of User-Agent strings; one is picked at random for requests that
     * do not specify CURLOPT_USERAGENT.
     */
    private static $agent = array(
        // Google Chrome
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        // Firefox
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
        'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0',
        // Internet Explorer
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)',
    );

    /**
     * @param int $window_size Maximum number of concurrent requests; values
     *                         below 1 fall back to the default of 5.
     */
    public function __construct($window_size = 5)
    {
        $window_size = (int)$window_size;
        $this->window_size = $window_size > 0 ? $window_size : 5;
    }

    /**
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }

    /**
     * Read access to the otherwise private properties.
     *
     * @param string $name
     * @return mixed The property value, or null when it is unset.
     */
    public function __get($name)
    {
        return isset($this->{$name}) ? $this->{$name} : null;
    }

    /**
     * Write access to the otherwise private properties.
     *
     * "options" and "headers" are merged into the current defaults instead of
     * replacing them: given options override defaults with the same key, and
     * given header lines replace default lines with the same header name.
     *
     * @param string $name
     * @param mixed $value
     * @return bool
     */
    public function __set($name, $value)
    {
        if ($name === 'options')
        {
            // array_replace keeps the integer CURLOPT_* keys and lets $value win.
            $this->options = array_replace($this->options, (array)$value);
        }
        elseif ($name === 'headers')
        {
            // Header lists are numerically indexed, so the array union operator
            // would clobber/drop lines by position; merge by header name instead.
            $this->headers = $this->mergeHeaders($this->headers, (array)$value);
        }
        else
        {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Add a GET request that uses the default headers and options in effect
     * when execute() runs.
     *
     * @param string $url
     * @return bool
     */
    public function add($url)
    {
        // No per-request overrides: get_options() applies the defaults.
        $this->requests[] = $this->createRequest($url, 'GET', array(), array());
        return true;
    }

    /**
     * Queue a GET request.
     *
     * @param string $url
     * @param array $headers List of raw header lines (not a key-value map), e.g.
     * $headers = array(
     * "Content-type: text/xml;charset=\"utf-8\"",
     * "Accept: text/xml",
     * "Cache-Control: no-cache",
     * "Pragma: no-cache",
     * "SOAPAction: \"run\"",
     * "Content-length: ".strlen($xml_data),
     * "Authorization: Basic " . base64_encode($credentials)
     * );
     * @param array $options Per-request curl options; they override the defaults.
     * @return bool
     */
    public function get($url, $headers = array(), $options = array())
    {
        $this->requests[] = $this->createRequest($url, "GET", $headers, $options);
        return true;
    }

    /**
     * Queue a POST request.
     *
     * @param string $url
     * @param array $headers List of raw header lines; see get().
     * @param array $options Per-request curl options; they override the defaults.
     * @param array|string $post_data Value for CURLOPT_POSTFIELDS. Defaults to
     *                                an empty body (the default also removes the
     *                                "optional before required parameter"
     *                                deprecation on PHP 8).
     * @return bool
     */
    public function post($url, $headers = array(), $options = array(), $post_data = array())
    {
        $this->requests[] = $this->createRequest($url, "POST", $headers, $options, $post_data);
        return true;
    }

    /**
     * Send every queued request and clear the queue.
     *
     * @param mixed $callback Optional callable `($output, $info, $request)`;
     *                        when given it replaces the stored callback.
     * @return mixed For exactly one queued request: the callback's return value,
     *               or the raw curl_exec() result when no callback is set.
     *               Otherwise: true when the rolling run completed, false when
     *               the queue was empty or curl_multi_exec() reported an error.
     */
    public function execute($callback = null)
    {
        if ($callback)
        {
            $this->callback = $callback;
        }
        if (count($this->requests) == 1)
        {
            $ret = $this->single_curl();
        }
        else
        {
            $ret = $this->rolling_curl();
        }
        // The queue is single-use: drop all requests once they have been run.
        $this->requests = $this->requestMap = array();
        return $ret;
    }

    /**
     * Perform the single queued request with a plain curl handle.
     *
     * @access private
     * @return mixed Callback result, or the curl_exec() output without a callback.
     */
    private function single_curl()
    {
        $request = array_shift($this->requests);
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($request));
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $this->closeHandle($ch);
        if ($this->callback && is_callable($this->callback))
        {
            return call_user_func($this->callback, $output, $info, $request);
        }
        return $output;
    }

    /**
     * Perform all queued requests, keeping at most `window_size` in flight.
     *
     * @access private
     * @return bool False when the queue is empty or curl_multi_exec() fails,
     *              true otherwise.
     */
    private function rolling_curl()
    {
        $this->requests = array_values($this->requests);
        $total = count($this->requests);
        if ($total < 1)
        {
            return false;
        }
        // Use a local window so a small batch does not permanently shrink the
        // configured window size; a window of 1 simply runs requests serially.
        $window = min(max(1, (int)$this->window_size), $total);
        $master = curl_multi_init();
        $callback = $this->callback;
        $notify = is_callable($callback);
        // Start the first batch of requests; $next is the next queue index to start.
        for ($next = 0; $next < $window; $next++)
        {
            $this->startRequest($master, $next);
        }
        $running = 0;
        do
        {
            $added = false;
            do
            {
                $status = curl_multi_exec($master, $running);
            } while ($status == CURLM_CALL_MULTI_PERFORM);
            if ($status != CURLM_OK)
            {
                break;
            }
            // One or more requests may have completed -- find out which.
            while ($done = curl_multi_info_read($master))
            {
                $handle = $done['handle'];
                $key = $this->handleKey($handle);
                $index = isset($this->requestMap[$key]) ? $this->requestMap[$key] : null;
                unset($this->requestMap[$key]);
                if ($notify && $index !== null)
                {
                    $info = curl_getinfo($handle);
                    $output = curl_multi_getcontent($handle);
                    call_user_func($callback, $output, $info, $this->requests[$index]);
                }
                // Start a new request (it's important to do this before removing the old one).
                if ($next < $total)
                {
                    $this->startRequest($master, $next);
                    $next++;
                    $added = true;
                }
                // Remove and release the handle that just completed.
                curl_multi_remove_handle($master, $handle);
                $this->closeHandle($handle);
            }
            // Block for activity, unless new handles were just added and need
            // curl_multi_exec() to start them first.
            if ($running && !$added)
            {
                if (curl_multi_select($master, $this->timeout) === -1)
                {
                    // select() can fail immediately on some builds; avoid a busy loop.
                    usleep(1000);
                }
            }
            // $running reflects the state before new handles were added, so also
            // keep looping when requests were started in this iteration.
        } while ($running || $added);
        curl_multi_close($master);
        return $status == CURLM_OK;
    }

    /**
     * Create a curl handle for the queued request at $index, attach it to the
     * multi handle and remember which request it belongs to.
     *
     * @param resource|object $master curl multi handle
     * @param int $index Index into $this->requests
     * @return void
     */
    private function startRequest($master, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($master, $ch);
        $this->requestMap[$this->handleKey($ch)] = $index;
    }

    /**
     * Build a string key identifying a curl handle.
     *
     * curl handles are resources before PHP 8 and CurlHandle objects from
     * PHP 8 on; objects cannot be cast to string, so hash them instead.
     *
     * @param resource|object $ch
     * @return string
     */
    private function handleKey($ch)
    {
        return is_resource($ch) ? (string)$ch : spl_object_hash($ch);
    }

    /**
     * Close a curl easy handle when it is a resource (PHP < 8). Handle objects
     * on PHP 8+ are released automatically, so nothing is done for them.
     *
     * @param resource|object $ch
     * @return void
     */
    private function closeHandle($ch)
    {
        if (is_resource($ch))
        {
            curl_close($ch);
        }
    }

    /**
     * Combine two lists of raw header lines. Lines in $base whose header name
     * (case-insensitive, text before the first ":") also occurs in $extra are
     * dropped; all lines of $extra are then appended in order.
     *
     * @param array $base
     * @param array $extra
     * @return array
     */
    private function mergeHeaders(array $base, array $extra)
    {
        $overridden = array();
        foreach ($extra as $line)
        {
            $overridden[$this->headerName($line)] = true;
        }
        $merged = array();
        foreach ($base as $line)
        {
            if (!isset($overridden[$this->headerName($line)]))
            {
                $merged[] = $line;
            }
        }
        foreach ($extra as $line)
        {
            $merged[] = $line;
        }
        return $merged;
    }

    /**
     * Extract the lower-cased header name from a raw header line.
     *
     * @param string $line
     * @return string
     */
    private function headerName($line)
    {
        $parts = explode(':', (string)$line, 2);
        return strtolower(trim($parts[0]));
    }

    /**
     * Build the final curl option array for one request.
     *
     * Precedence: per-request options override the instance defaults, and
     * per-request header lines replace default lines with the same name.
     *
     * @access private
     * @param stdClass $request Request object built by createRequest()
     * @return array Options suitable for curl_setopt_array()
     */
    private function get_options($request)
    {
        $options = $this->options;
        // Set the request URL.
        $options[CURLOPT_URL] = $request->url;
        // curl defaults to GET; switch to POST for POST requests or whenever
        // post data is present.
        if ($request->method === 'POST' || $request->post_data)
        {
            $options[CURLOPT_POST] = true;
            $options[CURLOPT_POSTFIELDS] = $request->post_data ? $request->post_data : '';
        }
        // Apply the custom options for this specific request.
        $custom = (array)$request->options;
        if (!empty($request->random_agent) && isset($this->options[CURLOPT_USERAGENT]))
        {
            // A globally configured user agent beats the auto-picked random one.
            unset($custom[CURLOPT_USERAGENT]);
        }
        if ($custom)
        {
            $options = array_replace($options, $custom);
        }
        // Add the request-specific headers.
        $options[CURLOPT_HTTPHEADER] = $this->mergeHeaders($this->headers, (array)$request->headers);
        return $options;
    }

    /**
     * Build a request object for the queue.
     *
     * @param string $url
     * @param string $method "GET" or "POST"
     * @param array $headers Raw header lines for this request
     * @param array $options curl options for this request
     * @param array|string $data POST payload
     * @return stdClass With url, method, headers, options, post_data and
     *                  random_agent (true when the user agent was auto-picked).
     */
    private function createRequest($url, $method, $headers, $options, $data = array())
    {
        $o = new stdClass();
        $o->url = $url;
        $o->method = $method;
        $o->headers = $headers;
        $o->options = $options;
        $o->post_data = $data;
        $o->random_agent = !isset($options[CURLOPT_USERAGENT]);
        if ($o->random_agent)
        {
            $o->options[CURLOPT_USERAGENT] = self::$agent[array_rand(self::$agent)];
        }
        return $o;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment