Skip to content

Instantly share code, notes, and snippets.

@id4ehsan
Created April 13, 2018 03:33
Show Gist options
  • Save id4ehsan/114cc88fefbf73ba90c3f11f3ef2ddb3 to your computer and use it in GitHub Desktop.
Save id4ehsan/114cc88fefbf73ba90c3f11f3ef2ddb3 to your computer and use it in GitHub Desktop.
data grabber
<?php
$servername = "localhost";
$username = "cp29857_test";
$password = "g0nd4l0f123";
$myDB = "cp29857_test";
try {
$conn = new PDO("mysql:host=$servername;dbname=$myDB", $username, $password);
// set the PDO error mode to exception
$conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
//echo "Connected successfully";
}
catch(PDOException $e)
{
echo "Connection failed: " . $e->getMessage();
}
function url_get_contents($url, $useragent='cURL', $headers=false, $follow_redirects=true, $debug=false) {
// initialise the CURL library
$ch = curl_init();
// specify the URL to be retrieved
curl_setopt($ch, CURLOPT_URL,$url);
// we want to get the contents of the URL and store it in a variable
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
// specify the useragent: this is a required courtesy to site owners
curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
// ignore SSL errors
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
// return headers as requested
if ($headers==true){
curl_setopt($ch, CURLOPT_HEADER,1);
}
// only return headers
if ($headers=='headers only') {
curl_setopt($ch, CURLOPT_NOBODY ,1);
}
// follow redirects - note this is disabled by default in most PHP installs from 4.4.4 up
if ($follow_redirects==true) {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
}
// if debugging, return an array with CURL's debug info and the URL contents
if ($debug==true) {
$result['contents']=curl_exec($ch);
$result['info']=curl_getinfo($ch);
}
// otherwise just return the contents as a variable
else $result=curl_exec($ch);
// free resources
curl_close($ch);
// send back the data
return $result;
}
//needs "php_curl" to be enabled (+php_openssl)
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false) {
$c = curl_init();curl_setopt($c, CURLOPT_URL, $url);
curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
//if parameters were passed to this function, then transform into POST method.. (if you need GET request, then simply change the passed URL)
if($post_paramtrs){curl_setopt($c, CURLOPT_POST,TRUE); curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs );}
curl_setopt($c, CURLOPT_SSL_VERIFYHOST,false);
curl_setopt($c, CURLOPT_SSL_VERIFYPEER,false);
curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
//We'd better to use the above command, because the following command gave some weird STATUS results..
//$header[0]= $user_agent="User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0"; $header[]="Cookie:CookieName1=Value;"; $header[]="Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; $header[]="Cache-Control: max-age=0"; $header[]="Connection: keep-alive"; $header[]="Keep-Alive: 300"; $header[]="Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7"; $header[] = "Accept-Language: en-us,en;q=0.5"; $header[] = "Pragma: "; curl_setopt($c, CURLOPT_HEADER, true); curl_setopt($c, CURLOPT_HTTPHEADER, $header);
curl_setopt($c, CURLOPT_MAXREDIRS, 10);
//if SAFE_MODE or OPEN_BASEDIR is set,then FollowLocation cant be used.. so...
$follow_allowed= ( ini_get('open_basedir') || ini_get('safe_mode')) ? false:true; if ($follow_allowed){curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);}
curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
curl_setopt($c, CURLOPT_REFERER, $url);
curl_setopt($c, CURLOPT_TIMEOUT, 60);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');
$data=curl_exec($c);$status=curl_getinfo($c);curl_close($c);
preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $status['url'],$link);
//correct assets URLs(i.e. retrieved url is: http://example.com/DIR/SUBDIR/page.html... then href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG", but href="/image.JPG" needs to become href="http://example.com/image.JPG")
//inside all links(except starting with HTTP,javascript:,HTTPS,//,/ ) insert that current DIRECTORY url (href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si','$1=$2'.$link[0].'$3$4$5', $data);
//inside all links(except starting with HTTP,javascript:,HTTPS,//) insert that DOMAIN url (href="/image.JPG" becomes href="http://example.com/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si','$1=$2'.$link[1].'://'.$link[3].'$3$4$5', $data);
// if redirected, then get that redirected page
if($status['http_code']==301 || $status['http_code']==302) {
//if we FOLLOWLOCATION was not allowed, then re-get REDIRECTED URL
//p.s. WE dont need "else", because if FOLLOWLOCATION was allowed, then we wouldnt have come to this place, because 301 could already auto-followed by curl :)
if (!$follow_allowed){
//if REDIRECT URL is found in HEADER
if(empty($redirURL)){if(!empty($status['redirect_url'])){$redirURL=$status['redirect_url'];}}
//if REDIRECT URL is found in RESPONSE
if(empty($redirURL)){preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m); if (!empty($m[2])){ $redirURL=$m[2]; } }
//if REDIRECT URL is found in OUTPUT
if(empty($redirURL)){preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si',$data,$m); if (!empty($m[1])){ $redirURL=$m[1]; } }
//if URL found, then re-use this function again, for the found url
if(!empty($redirURL)){$t=debug_backtrace(); return call_user_func( $t[0]["function"], trim($redirURL), $post_paramtrs);}
}
}
// if not redirected,and nor "status 200" page, then error..
elseif ( $status['http_code'] != 200 ) { $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";}
return ( $return_full_array ? array('data'=>$data,'info'=>$status) : $data);
}
$homepage = get_remote_data("https://www.dotproperty.co.th/en");
preg_match_all('|href="(?:(?!javascript))(https:\/\/www\.dotproperty\.[^"]+?)"|isU',
$homepage,
$outs, PREG_SET_ORDER);
//echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">".print_r($outs)."</textarea>";
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
//for ($i=0 ; $i < sizeof($out) ; $i++){
// echo $out[i][0];
//}
foreach ($outs as $out){
echo $out[1] . "\n";
}
echo "</textarea>";
echo "<p>Finnished</p>";
?>
<?php
set_time_limit(0);
ini_set('max_execution_time', 0);
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false) {
$c = curl_init();curl_setopt($c, CURLOPT_URL, $url);
curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
//if parameters were passed to this function, then transform into POST method.. (if you need GET request, then simply change the passed URL)
if($post_paramtrs){curl_setopt($c, CURLOPT_POST,TRUE); curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs );}
curl_setopt($c, CURLOPT_SSL_VERIFYHOST,false);
curl_setopt($c, CURLOPT_SSL_VERIFYPEER,false);
curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
//We'd better to use the above command, because the following command gave some weird STATUS results..
$header[0]= $user_agent="User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0"; $header[]="Cookie:CookieName1=Value;"; $header[]="Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; $header[]="Cache-Control: max-age=0"; $header[]="Connection: keep-alive"; $header[]="Keep-Alive: 300"; $header[]="Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7"; $header[] = "Accept-Language: en-us,en;q=0.5"; $header[] = "Pragma: "; curl_setopt($c, CURLOPT_HEADER, true); curl_setopt($c, CURLOPT_HTTPHEADER, $header);
curl_setopt($c, CURLOPT_MAXREDIRS, 10);
//if SAFE_MODE or OPEN_BASEDIR is set,then FollowLocation cant be used.. so...
$follow_allowed= ( ini_get('open_basedir') || ini_get('safe_mode')) ? false:true; if ($follow_allowed){curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);}
curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
curl_setopt($c, CURLOPT_REFERER, $url);
curl_setopt($c, CURLOPT_TIMEOUT, 60);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');
$data=curl_exec($c);$status=curl_getinfo($c);curl_close($c);
preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $status['url'],$link);
//correct assets URLs(i.e. retrieved url is: http://example.com/DIR/SUBDIR/page.html... then href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG", but href="/image.JPG" needs to become href="http://example.com/image.JPG")
//inside all links(except starting with HTTP,javascript:,HTTPS,//,/ ) insert that current DIRECTORY url (href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si','$1=$2'.$link[0].'$3$4$5', $data);
//inside all links(except starting with HTTP,javascript:,HTTPS,//) insert that DOMAIN url (href="/image.JPG" becomes href="http://example.com/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si','$1=$2'.$link[1].'://'.$link[3].'$3$4$5', $data);
// if redirected, then get that redirected page
if($status['http_code']==301 || $status['http_code']==302) {
//if we FOLLOWLOCATION was not allowed, then re-get REDIRECTED URL
//p.s. WE dont need "else", because if FOLLOWLOCATION was allowed, then we wouldnt have come to this place, because 301 could already auto-followed by curl :)
if (!$follow_allowed){
//if REDIRECT URL is found in HEADER
if(empty($redirURL)){if(!empty($status['redirect_url'])){$redirURL=$status['redirect_url'];}}
//if REDIRECT URL is found in RESPONSE
if(empty($redirURL)){preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m); if (!empty($m[2])){ $redirURL=$m[2]; } }
//if REDIRECT URL is found in OUTPUT
if(empty($redirURL)){preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si',$data,$m); if (!empty($m[1])){ $redirURL=$m[1]; } }
//if URL found, then re-use this function again, for the found url
if(!empty($redirURL)){$t=debug_backtrace(); return call_user_func( $t[0]["function"], trim($redirURL), $post_paramtrs);}
}
}
// if not redirected,and nor "status 200" page, then error..
elseif ( $status['http_code'] != 200 ) { $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";}
return ( $return_full_array ? array('data'=>$data,'info'=>$status) : $data);
}
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
$data = get_remote_data("http://www.o-xe.com/");
//echo $data;
preg_match_all("|<td\s+style='background-image\s*:\s+url\(\/images\/icons\/([^\.]+).png\);background-repeat\s*:\s+no-repeat;background-position\s*:\s+left\s+center;cursor:pointer'\s+\s+class='MonetaryText'\s+title='([^']*)'\s+id='([^']*)'><b>([^<]*)<\/b><\/td><td\s+align='center'\s+class='MonetaryBuy'>([0-9,]*)<\/td><td\s+align='center'\s+class='MonetarySell'>([0-9,]*)<\/td>|isU",
$data,
$outs,
PREG_SET_ORDER);
//echo $data;
foreach ($outs as $out){
echo $out[1] . " ".$out[2] . " ".$out[3] . " ".$out[4] . " ".$out[5] . " ".$out[6] . " ". "\n";
}
//print_r($outs);
echo "</textarea>";
echo "<p>Finnished</p>";
?>
<?php
set_time_limit(0);
ini_set('max_execution_time', 0);
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false) {
$c = curl_init();curl_setopt($c, CURLOPT_URL, $url);
curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
//if parameters were passed to this function, then transform into POST method.. (if you need GET request, then simply change the passed URL)
if($post_paramtrs){curl_setopt($c, CURLOPT_POST,TRUE); curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs );}
curl_setopt($c, CURLOPT_SSL_VERIFYHOST,false);
curl_setopt($c, CURLOPT_SSL_VERIFYPEER,false);
curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
//We'd better to use the above command, because the following command gave some weird STATUS results..
$header[0]= $user_agent="User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0"; $header[]="Cookie:CookieName1=Value;"; $header[]="Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; $header[]="Cache-Control: max-age=0"; $header[]="Connection: keep-alive"; $header[]="Keep-Alive: 300"; $header[]="Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7"; $header[] = "Accept-Language: en-us,en;q=0.5"; $header[] = "Pragma: "; curl_setopt($c, CURLOPT_HEADER, true); curl_setopt($c, CURLOPT_HTTPHEADER, $header);
curl_setopt($c, CURLOPT_MAXREDIRS, 10);
//if SAFE_MODE or OPEN_BASEDIR is set,then FollowLocation cant be used.. so...
$follow_allowed= ( ini_get('open_basedir') || ini_get('safe_mode')) ? false:true; if ($follow_allowed){curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);}
curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
curl_setopt($c, CURLOPT_REFERER, $url);
curl_setopt($c, CURLOPT_TIMEOUT, 60);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');
$data=curl_exec($c);$status=curl_getinfo($c);curl_close($c);
preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $status['url'],$link);
//correct assets URLs(i.e. retrieved url is: http://example.com/DIR/SUBDIR/page.html... then href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG", but href="/image.JPG" needs to become href="http://example.com/image.JPG")
//inside all links(except starting with HTTP,javascript:,HTTPS,//,/ ) insert that current DIRECTORY url (href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si','$1=$2'.$link[0].'$3$4$5', $data);
//inside all links(except starting with HTTP,javascript:,HTTPS,//) insert that DOMAIN url (href="/image.JPG" becomes href="http://example.com/image.JPG")
$data=preg_replace('/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si','$1=$2'.$link[1].'://'.$link[3].'$3$4$5', $data);
// if redirected, then get that redirected page
if($status['http_code']==301 || $status['http_code']==302) {
//if we FOLLOWLOCATION was not allowed, then re-get REDIRECTED URL
//p.s. WE dont need "else", because if FOLLOWLOCATION was allowed, then we wouldnt have come to this place, because 301 could already auto-followed by curl :)
if (!$follow_allowed){
//if REDIRECT URL is found in HEADER
if(empty($redirURL)){if(!empty($status['redirect_url'])){$redirURL=$status['redirect_url'];}}
//if REDIRECT URL is found in RESPONSE
if(empty($redirURL)){preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m); if (!empty($m[2])){ $redirURL=$m[2]; } }
//if REDIRECT URL is found in OUTPUT
if(empty($redirURL)){preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si',$data,$m); if (!empty($m[1])){ $redirURL=$m[1]; } }
//if URL found, then re-use this function again, for the found url
if(!empty($redirURL)){$t=debug_backtrace(); return call_user_func( $t[0]["function"], trim($redirURL), $post_paramtrs);}
}
}
// if not redirected,and nor "status 200" page, then error..
elseif ( $status['http_code'] != 200 ) { $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";}
return ( $return_full_array ? array('data'=>$data,'info'=>$status) : $data);
}
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
for ($i=1 ; $i <= 50 ; $i++) {
if($i == 1){
$data = get_remote_data("https://www.dotproperty.co.th/en/properties-for-rent");
}else{
$data = get_remote_data("https://www.dotproperty.co.th/en/properties-for-rent?page=$i");
}
preg_match_all('|<div\s*class="wrapper">\s*<div\s*class="left-block">\s*<a\s*target="?_blank"?\s*data-tracking="[^"]*"\s*href="(https:\/\/www\.dotproperty\.co\.th\/en\/[^"]*)"\s*title="([^"]*)"\s*onmousedown="[^"]*">|isU',
$data,
$outs, PREG_SET_ORDER);
foreach ($outs as $out){
echo $out[1] . "\n";
}
}
echo "</textarea>";
echo "<p>Finnished</p>";
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment