Skip to content

Instantly share code, notes, and snippets.

@luckyshot
Last active November 1, 2024 08:20
Show Gist options
  • Save luckyshot/5395600 to your computer and use it in GitHub Desktop.
Save luckyshot/5395600 to your computer and use it in GitHub Desktop.
Web scraping done right (with cUrl and user agent)
<?php return array (
// Sample of the associative array that WebScrap::curl() returns, here for a
// GET request to https://xaviesteve.com/. It consists of the curl_getinfo()
// fields plus the keys curl() adds itself: 'content', 'cookies', 'errno',
// 'errmsg' and 'headers'.
'url' => 'https://xaviesteve.com/',
// Response body (abbreviated in this sample).
'content' => '<!doctype html><html>...</html>',
// name=value pair(s) taken from the Set-Cookie line(s) of 'headers' below,
// joined with "; " so the string can be passed back as the 'cookies' option.
'cookies' => '__cfduid=d3fa669e1069e72c2e47d127ab9b8e11f1465390629',
'http_code' => 200,
'content_type' => 'text/html; charset=UTF-8',
// Length in bytes of the response header block; curl() uses it to split
// headers from body.
'header_size' => 578,
'request_size' => 229,
'filetime' => -1,
'ssl_verify_result' => 0,
'redirect_count' => 0,
'total_time' => 0.27407799999999999,
'namelookup_time' => 0.028674000000000002,
'connect_time' => 0.030345,
'pretransfer_time' => 0.046539999999999998,
'size_upload' => 0,
'size_download' => 5876,
'speed_download' => 21439,
'speed_upload' => 0,
'download_content_length' => -1,
'upload_content_length' => 0,
'starttransfer_time' => 0.27336100000000002,
'redirect_time' => 0,
'redirect_url' => '',
'primary_ip' => '104.28.5.26',
'certinfo' =>
array (
),
'primary_port' => 443,
'local_ip' => '178.62.98.107',
'local_port' => 46821,
// Raw request as sent; reported because curl() enables CURLINFO_HEADER_OUT.
'request_header' => 'GET / HTTP/1.1
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36
Host: xaviesteve.com
Accept: */*
Accept-Encoding: deflate, gzip
Cookie:
',
// curl_errno() / curl_error() results; 0 and an empty string in this sample.
'errno' => 0,
'errmsg' => '',
// Raw response header block (the first 'header_size' bytes of the response).
'headers' => 'HTTP/1.1 200 OK
Date: Wed, 08 Jun 2016 12:57:09 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: keep-alive
Set-Cookie: __cfduid=d3fa669e1069e72c2e47d127ab9b8e11f1465390629; expires=Thu, 08-Jun-17 12:57:09 GMT; path=/; domain=.xaviesteve.com; HttpOnly
Vary: Accept-Encoding
Link: <https://xaviesteve.com/wp-json/>; rel="https://api.w.org/"
Link: <https://xaviesteve.com/>; rel=shortlink
Cache-Control: max-age=3600
Expires: Wed, 08 Jun 2016 13:57:09 GMT
Server: cloudflare-nginx
CF-RAY: 2afc82084d1c3530-LHR
Content-Encoding: gzip
'
);
<?php
/*
v3.0.1
$ws = new WebScrap;
$html = $ws->curl("http://www.example.com/", [
'cookies' => 'fruit=apple; colour=red',
'headers' => [
'Authorization: Bearer AbCdEfGhIjKlMnOpQ',
'Content-Type: application/json',
],
'post' => [
'firstname' => 'Xavi',
'lastname' => 'Esteve'
],
'userpass' => 'admin:password',
]);
echo $ws->regex( "#This domain is established to be used for (.*?) examples in documents#mi", $html['content'] )[0][1];
*/
/**
 * Small cURL wrapper for fetching a page with browser-like defaults, plus a
 * thin helper around preg_match_all().
 */
class WebScrap
{
    /**
     * User-Agent strings; one is picked at random when the caller does not
     * supply a 'user_agent' option.
     */
    private static $userAgents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.56 (KHTML, like Gecko) Version/9.0 Safari/601.1.56',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
    ];

    /**
     * Fetch a URL and return the transfer info together with headers, body
     * and cookies.
     *
     * Supported keys of $custom (all optional):
     *   'cookies'    => 'name1=content1; name2=content2;'  Cookie header value
     *   'headers'    => ['Authorization: Bearer ...', ...] extra request headers
     *   'post'       => ['firstname' => 'Xavi', ...]       sent as a POST body
     *   'user_agent' => '...'   if the key is absent, a random one is used
     *   'userpass'   => 'clark:kent'                       HTTP auth credentials
     *   'verify_ssl' => true    verify the peer certificate (default: false)
     *
     * @param string $url    URL to request.
     * @param array  $custom Per-request options, see above.
     *
     * @return array The curl_getinfo() array with these keys added:
     *               'errno', 'errmsg', 'headers' (raw response header block),
     *               'content' (trimmed body) and 'cookies' ("a=b; c=d" string
     *               built from the Set-Cookie response headers).
     */
    public function curl($url, $custom = [])
    {
        // http://php.net/manual/en/function.curl-setopt.php
        $options = [
            CURLOPT_RETURNTRANSFER => true, // return the response instead of printing it
            CURLOPT_HEADER         => true, // include response headers in the returned string
            CURLOPT_FOLLOWLOCATION => true, // follow redirects
            CURLOPT_ENCODING       => '',   // accept every encoding cURL supports
            CURLOPT_AUTOREFERER    => true, // set Referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,  // timeout on connect (seconds)
            CURLOPT_TIMEOUT        => 120,  // timeout on the whole transfer (seconds)
            CURLOPT_MAXREDIRS      => 10,   // stop after 10 redirects
            CURLINFO_HEADER_OUT    => true, // expose the sent request as 'request_header'
            // NOTE: certificate verification stays off by default for backward
            // compatibility; pass 'verify_ssl' => true to turn it on.
            CURLOPT_SSL_VERIFYPEER => !empty($custom['verify_ssl']),
            CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_1_1,
            CURLOPT_USERAGENT      => array_key_exists('user_agent', $custom)
                ? $custom['user_agent']
                : self::$userAgents[array_rand(self::$userAgents)],
        ];

        // Only set the option when there is something to send: passing null
        // makes cURL emit an empty "Cookie:" request header.
        if (isset($custom['cookies']) && $custom['cookies'] !== '') {
            $options[CURLOPT_COOKIE] = $custom['cookies'];
        }

        // Extra request headers
        if (array_key_exists('headers', $custom) && is_array($custom['headers'])) {
            $options[CURLOPT_HTTPHEADER] = $custom['headers'];
        }

        // Post data. An array handed to CURLOPT_POSTFIELDS is sent by cURL as
        // multipart/form-data; it is NOT converted to JSON.
        if (array_key_exists('post', $custom) && is_array($custom['post'])) {
            $options[CURLOPT_POST]       = true;
            $options[CURLOPT_POSTFIELDS] = $custom['post'];
        }

        if (array_key_exists('userpass', $custom)) {
            $options[CURLOPT_USERPWD] = $custom['userpass'];
        }

        $ch = curl_init($url);
        if ($ch === false) {
            // Report the failure through the same errno/errmsg keys as any
            // other transfer error instead of passing false to curl_* calls.
            return [
                'url'       => $url,
                'http_code' => 0,
                'errno'     => CURLE_FAILED_INIT,
                'errmsg'    => 'Could not initialise a cURL handle',
                'headers'   => '',
                'content'   => '',
                'cookies'   => '',
            ];
        }

        curl_setopt_array($ch, $options);
        $raw    = curl_exec($ch);
        $errno  = curl_errno($ch);
        $errmsg = curl_error($ch);
        $info   = curl_getinfo($ch);
        curl_close($ch);

        if (!is_array($info)) {
            $info = [];
        }
        // curl_exec() returns false on failure; treat that as an empty response.
        if (!is_string($raw)) {
            $raw = '';
        }

        // Split by offset. header_size covers the header blocks of every
        // response in a redirect chain, so everything after it is the body.
        $headerSize  = isset($info['header_size']) ? (int) $info['header_size'] : 0;
        $headerBlock = (string) substr($raw, 0, $headerSize);
        $body        = trim((string) substr($raw, $headerSize));

        $info['errno']   = $errno;
        $info['errmsg']  = $errmsg;
        $info['headers'] = $headerBlock;
        $info['content'] = $body;
        $info['cookies'] = $this->extractCookies($headerBlock);

        return $info;
    }

    /**
     * Run a pattern against a string and return every match.
     *
     * @param string $regex  PCRE pattern including delimiters and flags
     *                       (http://php.net/manual/en/reference.pcre.pattern.modifiers.php).
     * @param string $string Subject to search.
     *
     * @return array One entry per match (PREG_SET_ORDER): index 0 is the full
     *               match, 1..n the capture groups. Empty array when nothing
     *               matched or the pattern could not be applied.
     */
    public function regex($regex, $string)
    {
        $matches = [];
        preg_match_all($regex, $string, $matches, PREG_SET_ORDER);

        return is_array($matches) ? $matches : [];
    }

    /**
     * Build a "name=value; name2=value2" string from the Set-Cookie lines of a
     * raw response header block.
     */
    private function extractCookies($headerBlock)
    {
        // Header names are case-insensitive and the space after the colon is
        // optional. CR/LF are excluded from both classes so that a cookie
        // without attributes cannot swallow the header lines that follow it.
        $found = preg_match_all(
            '#^Set-Cookie:[ \t]*(?<cookie>[^=;\r\n]+=[^;\r\n]*)#mi',
            $headerBlock,
            $matches
        );
        if (!$found) {
            return '';
        }

        return implode('; ', array_map('trim', $matches['cookie']));
    }
}
@its-zero-to-infinity
Copy link

How do I use it, and what is the installation process? Nothing is showing on the webpage.

@wackyapps
Copy link

wackyapps commented Oct 14, 2017

@luckyshot -> very nice. Indeed this helped me. (y)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment