Last active
November 1, 2024 08:20
-
-
Save luckyshot/5395600 to your computer and use it in GitHub Desktop.
Web scraping done right (with cURL and a user agent)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Sample of the array returned by WebScrap::curl(): the curl_getinfo() fields
// plus the 'content', 'cookies', 'errno', 'errmsg' and 'headers' keys that
// curl() adds. Captured from a GET request to https://xaviesteve.com/.
return [
    'url' => 'https://xaviesteve.com/',
    'content' => '<!doctype html><html>...</html>',
    'cookies' => '__cfduid=d3fa669e1069e72c2e47d127ab9b8e11f1465390629',
    'http_code' => 200,
    'content_type' => 'text/html; charset=UTF-8',
    'header_size' => 578,
    'request_size' => 229,
    'filetime' => -1,
    'ssl_verify_result' => 0,
    'redirect_count' => 0,
    'total_time' => 0.27407799999999999,
    'namelookup_time' => 0.028674000000000002,
    'connect_time' => 0.030345,
    'pretransfer_time' => 0.046539999999999998,
    'size_upload' => 0,
    'size_download' => 5876,
    'speed_download' => 21439,
    'speed_upload' => 0,
    'download_content_length' => -1,
    'upload_content_length' => 0,
    'starttransfer_time' => 0.27336100000000002,
    'redirect_time' => 0,
    'redirect_url' => '',
    'primary_ip' => '104.28.5.26',
    'certinfo' => [],
    'primary_port' => 443,
    'local_ip' => '178.62.98.107',
    'local_port' => 46821,
    // Raw request as sent (available because CURLINFO_HEADER_OUT is enabled).
    'request_header' => 'GET / HTTP/1.1
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36
Host: xaviesteve.com
Accept: */*
Accept-Encoding: deflate, gzip
Cookie:
',
    'errno' => 0,
    'errmsg' => '',
    // Raw response header block, split off the response using 'header_size'.
    'headers' => 'HTTP/1.1 200 OK
Date: Wed, 08 Jun 2016 12:57:09 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: keep-alive
Set-Cookie: __cfduid=d3fa669e1069e72c2e47d127ab9b8e11f1465390629; expires=Thu, 08-Jun-17 12:57:09 GMT; path=/; domain=.xaviesteve.com; HttpOnly
Vary: Accept-Encoding
Link: <https://xaviesteve.com/wp-json/>; rel="https://api.w.org/"
Link: <https://xaviesteve.com/>; rel=shortlink
Cache-Control: max-age=3600
Expires: Wed, 08 Jun 2016 13:57:09 GMT
Server: cloudflare-nginx
CF-RAY: 2afc82084d1c3530-LHR
Content-Encoding: gzip
',
];
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*
v3.0.1

$ws = new WebScrap;
$html = $ws->curl("http://www.example.com/", [
    'cookies' => 'fruit=apple; colour=red',
    'headers' => [
        'Authorization: Bearer AbCdEfGhIjKlMnOpQ',
        'Content-Type: application/json',
    ],
    'post' => [
        'firstname' => 'Xavi',
        'lastname' => 'Esteve'
    ],
    'userpass' => 'admin:password',
]);
echo $ws->regex( "#This domain is established to be used for (.*?) examples in documents#mi", $html['content'] )[0][1];
*/
/**
 * Small cURL wrapper for fetching pages with a browser-like user agent,
 * plus a regex helper for pulling data out of the fetched content.
 */
class WebScrap
{
    /**
     * Pool of browser user-agent strings; one is picked at random when the
     * caller does not supply 'user_agent'.
     */
    private static $userAgents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.56 (KHTML, like Gecko) Version/9.0 Safari/601.1.56',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
    ];

    /**
     * Fetch a URL with cURL and return transfer info, headers, body and cookies.
     *
     * Supported keys in $custom (all optional):
     *
     *     $custom = [
     *         'cookies'    => 'name1=content1; name2=content2;',
     *         'headers'    => ['Authorization: Bearer AbCdEfGhIjKlMnOpQ'],
     *         'post'       => ['firstname' => 'Xavi', 'lastname' => 'Esteve'],
     *         'user_agent' => '...',        // if not set, one is picked at random from the list
     *         'userpass'   => 'clark:kent',
     *         'ssl_verify' => true,         // verify the peer certificate (off by default)
     *     ];
     *
     * 'post' is handed to CURLOPT_POSTFIELDS unchanged: an array is sent by
     * cURL as multipart/form-data (it is NOT converted to JSON); to send a
     * JSON body pass an already-encoded string and a matching Content-Type
     * entry in 'headers'.
     *
     * @param string $url    Address to request.
     * @param array  $custom Per-request overrides, see above.
     *
     * @return array The curl_getinfo() fields plus:
     *               'errno'   (int)    cURL error number, 0 on success,
     *               'errmsg'  (string) cURL error message,
     *               'headers' (string) raw response header block(s),
     *               'content' (string) response body, trimmed,
     *               'cookies' (string) "name=value" pairs from Set-Cookie headers joined by "; ".
     */
    public function curl( $url, $custom = [] )
    {
        $userAgent = isset( $custom['user_agent'] )
            ? $custom['user_agent']
            : self::$userAgents[ array_rand( self::$userAgents ) ];

        // http://php.net/manual/en/function.curl-setopt.php
        $options = [
            CURLOPT_RETURNTRANSFER => true, // return the response instead of printing it
            CURLOPT_HEADER => true, // include response headers in the returned string
            CURLOPT_FOLLOWLOCATION => true, // follow redirects
            CURLOPT_ENCODING => "", // accept every encoding cURL supports
            CURLOPT_AUTOREFERER => true, // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
            CURLOPT_TIMEOUT => 120, // timeout on response
            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
            CURLINFO_HEADER_OUT => true, // expose the sent request as 'request_header'
            // NOTE(security): certificate checks stay disabled by default to keep
            // existing behaviour; pass 'ssl_verify' => true to enable them.
            CURLOPT_SSL_VERIFYPEER => ! empty( $custom['ssl_verify'] ),
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_USERAGENT => $userAgent,
        ];

        // Only set the cookie option when cookies were supplied; setting it
        // unconditionally sends an empty "Cookie:" request header.
        if ( isset( $custom['cookies'] ) && $custom['cookies'] !== '' ) {
            $options[ CURLOPT_COOKIE ] = $custom['cookies'];
        }

        // Extra request headers
        if ( isset( $custom['headers'] ) && is_array( $custom['headers'] ) ) {
            $options[ CURLOPT_HTTPHEADER ] = $custom['headers'];
        }

        // Post data: array (multipart/form-data) or pre-encoded string body
        if ( isset( $custom['post'] ) && ( is_array( $custom['post'] ) || is_string( $custom['post'] ) ) ) {
            $options[ CURLOPT_POST ] = true;
            $options[ CURLOPT_POSTFIELDS ] = $custom['post'];
        }

        if ( isset( $custom['userpass'] ) ) {
            $options[ CURLOPT_USERPWD ] = $custom['userpass'];
        }

        $ch = curl_init( $url );
        curl_setopt_array( $ch, $options );
        $raw = curl_exec( $ch );
        $errno = curl_errno( $ch );
        $errmsg = curl_error( $ch );
        $info = curl_getinfo( $ch );
        // Handles are objects from PHP 8 on and are freed automatically;
        // explicit closing is only needed for the older resource handles.
        if ( PHP_VERSION_ID < 80000 ) {
            curl_close( $ch );
        }

        // curl_exec() returns false on failure; treat that as an empty response.
        if ( ! is_string( $raw ) ) {
            $raw = '';
        }

        // Split at the reported header size rather than search-and-replacing the
        // header text, which would also strip matching text from the body.
        $headerSize = isset( $info['header_size'] ) ? (int) $info['header_size'] : 0;
        $headerContent = (string) substr( $raw, 0, $headerSize );
        $bodyContent = trim( (string) substr( $raw, $headerSize ) );

        $info['errno'] = $errno;
        $info['errmsg'] = $errmsg;
        $info['headers'] = $headerContent;
        $info['content'] = $bodyContent;
        $info['cookies'] = $this->extractCookies( $headerContent );

        return $info;
    }

    /**
     * Run a regular expression against a string and return every match.
     *
     * Regex flags: http://php.net/manual/en/reference.pcre.pattern.modifiers.php
     *
     * @param string $regex  Full PCRE pattern including delimiters and flags.
     * @param string $string Text to search.
     *
     * @return array One entry per match (PREG_SET_ORDER); each entry holds the
     *               full match at index 0 followed by its capture groups.
     *               Empty array when nothing matches or the pattern is invalid.
     */
    public function regex( $regex, $string )
    {
        // Pre-initialise so an invalid pattern yields [] instead of an undefined variable.
        $matches = [];
        preg_match_all(
            $regex,
            $string,
            $matches,
            PREG_SET_ORDER // formats data into an array of items
        );

        return is_array( $matches ) ? $matches : [];
    }

    /**
     * Collect the "name=value" part of every Set-Cookie line in a header block.
     *
     * Header names are matched case-insensitively and a value never runs past
     * the end of its own line, so cookies without attributes are handled.
     *
     * @param string $headers Raw response header text.
     *
     * @return string Pairs joined with "; ", or '' when there are none.
     */
    private function extractCookies( $headers )
    {
        $found = [];
        $pattern = '#^Set-Cookie:[ \t]*(?<cookie>[^=;\r\n]+=[^;\r\n]*)#mi';
        if ( ! preg_match_all( $pattern, (string) $headers, $found ) ) {
            return '';
        }

        return implode( '; ', array_map( 'trim', $found['cookie'] ) );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@lucyshot -> very nice. Indeed this helped me. (y)