Created
April 13, 2018 03:33
-
-
Save id4ehsan/114cc88fefbf73ba90c3f11f3ef2ddb3 to your computer and use it in GitHub Desktop.
data grabber
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Database connection settings.
// NOTE(review): credentials are hard-coded in the source file; consider moving
// them to configuration outside the web root and rotating this password.
$servername = "localhost";
$username = "cp29857_test";
$password = "g0nd4l0f123";
$myDB = "cp29857_test";

// $conn stays null when the connection cannot be established, so later code
// can test it instead of hitting an undefined variable.
$conn = null;
try {
    $conn = new PDO("mysql:host=$servername;dbname=$myDB", $username, $password);
    // Make PDO throw exceptions on errors instead of failing silently.
    $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
} catch (PDOException $e) {
    $conn = null;
    // Keep the details (which may name the host and user) in the server log
    // and show only a generic message to the client. The script continues,
    // as it did before, because nothing below requires the database.
    error_log("Connection failed: " . $e->getMessage());
    echo "Connection failed";
}
/**
 * Fetch a URL with cURL and return what the server sent back.
 *
 * @param string      $url              URL to retrieve.
 * @param string      $useragent        User-Agent value sent with the request.
 * @param bool|string $headers          false: body only; any other truthy value: response
 *                                      headers followed by the body; the exact string
 *                                      'headers only': response headers without the body.
 * @param bool        $follow_redirects Whether cURL should follow redirects itself.
 * @param bool        $debug            When true, return an array with the keys 'contents'
 *                                      (curl_exec() result) and 'info' (curl_getinfo() result).
 *
 * @return string|false|array The response text (false when curl_exec() fails), or the
 *                            debug array described above.
 */
function url_get_contents($url, $useragent='cURL', $headers=false, $follow_redirects=true, $debug=false) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the response from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Identifying ourselves is a courtesy to site owners.
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
    // NOTE(review): certificate verification is switched off (kept from the
    // original, which deliberately ignores SSL errors); this allows
    // man-in-the-middle tampering with the fetched content.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

    // Strict comparison is required here: with a loose "==" the boolean true
    // also equals 'headers only', which silently dropped the body whenever
    // the caller asked for headers *and* body.
    $headers_only = ($headers === 'headers only');
    if ($headers == true) {
        curl_setopt($ch, CURLOPT_HEADER, 1);
    }
    if ($headers_only) {
        curl_setopt($ch, CURLOPT_NOBODY, 1);
    }

    if ($follow_redirects == true) {
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    }

    if ($debug == true) {
        // curl_getinfo() must run after curl_exec() to describe the transfer.
        $result = array();
        $result['contents'] = curl_exec($ch);
        $result['info'] = curl_getinfo($ch);
    } else {
        $result = curl_exec($ch);
    }

    curl_close($ch);

    return $result;
}
/**
 * Download a page with cURL and rewrite its relative links to absolute URLs.
 *
 * Needs the "php_curl" extension to be enabled (+php_openssl).
 *
 * @param string       $url               URL to fetch.
 * @param string|false $post_paramtrs     URL-encoded POST fields. When truthy the request is sent
 *                                        as POST with the body "var1=bla&" followed by this value;
 *                                        otherwise a GET is sent (put GET parameters in $url).
 * @param bool         $return_full_array When true, return array('data' => ..., 'info' => ...),
 *                                        where 'info' is the curl_getinfo() result.
 * @param int          $max_redirects     Number of redirect hops this function may still follow
 *                                        by itself. Only relevant when cURL is not allowed to
 *                                        follow redirects (open_basedir / safe_mode set).
 *
 * @return string|array The page text with src/href/action values made absolute, or the array
 *                      described above. For a final status other than 200 (and not a redirect)
 *                      the text is an "ERRORCODE22 ..." diagnostic message.
 */
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false, $max_redirects=10) {
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    if ($post_paramtrs) {
        curl_setopt($c, CURLOPT_POST, TRUE);
        curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs);
    }
    // NOTE(review): certificate and host-name verification are switched off
    // (kept from the original); fetched content can be tampered with in transit.
    curl_setopt($c, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($c, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
    curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
    curl_setopt($c, CURLOPT_MAXREDIRS, 10);
    // With SAFE_MODE or OPEN_BASEDIR set, FOLLOWLOCATION cannot be used, so
    // redirects are then followed manually further down.
    $follow_allowed = !(ini_get('open_basedir') || ini_get('safe_mode'));
    if ($follow_allowed) {
        curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
    curl_setopt($c, CURLOPT_REFERER, $url);
    curl_setopt($c, CURLOPT_TIMEOUT, 60);
    curl_setopt($c, CURLOPT_AUTOREFERER, true);
    curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');

    $data = curl_exec($c);
    $status = curl_getinfo($c);
    curl_close($c);

    // A failed transfer yields false; treat it as an empty document so the
    // string handling below always works on a string.
    if ($data === false) {
        $data = '';
    }

    // Split the effective URL into scheme, host and directory. Links are only
    // rewritten when this succeeds; otherwise they would be prefixed with
    // garbage such as "://".
    $effective_url = isset($status['url']) ? $status['url'] : '';
    if ($data !== '' && preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $effective_url, $link)) {
        $directory_url = $link[0];
        $domain_url = $link[1].'://'.$link[3];

        // Callbacks are used instead of replacement strings so that "$" or
        // "\" characters inside the URL are never read as group references.

        // Links not starting with http, https, javascript:, // or / get the
        // current directory prepended:
        // href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/./image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si',
            function ($m) use ($directory_url) {
                return $m[1].'='.$m[2].$directory_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }

        // What is still not absolute now starts with a single "/" and gets the
        // domain prepended: href="/image.JPG" becomes href="http://example.com/image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si',
            function ($m) use ($domain_url) {
                return $m[1].'='.$m[2].$domain_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }
    }

    $http_code = isset($status['http_code']) ? (int) $status['http_code'] : 0;

    if (in_array($http_code, array(301, 302, 303, 307, 308), true)) {
        // When cURL was allowed to follow redirects it already did so, and
        // there is nothing left to do here.
        if (!$follow_allowed) {
            $redirURL = '';
            if (!empty($status['redirect_url'])) {
                // Redirect target reported by cURL.
                $redirURL = $status['redirect_url'];
            } elseif (preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a header line inside the response text.
                $redirURL = $m[2];
            } elseif (preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a 'moved <a href="...">here</a>' body;
                // group 2 is the href value (group 1 is only the tag attributes).
                $redirURL = $m[2];
            }
            $redirURL = trim($redirURL);
            if ($redirURL !== '' && $max_redirects > 0) {
                // A 303 response asks for a GET of the new location.
                $next_post = ($http_code === 303) ? false : $post_paramtrs;
                return get_remote_data($redirURL, $next_post, $return_full_array, $max_redirects - 1);
            }
        }
    } elseif ($http_code != 200) {
        // Neither a redirect nor a "200" page: report the failure in the data.
        $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";
    }

    return ($return_full_array ? array('data'=>$data, 'info'=>$status) : $data);
}
// Fetch the dotproperty.co.th English home page and print every absolute
// www.dotproperty.* link found in it, one per line, inside a <textarea>.
$homepage = get_remote_data("https://www.dotproperty.co.th/en");
$outs = array();
$found = preg_match_all('|href="(?:(?!javascript))(https:\/\/www\.dotproperty\.[^"]+?)"|isU',
    (string) $homepage,
    $outs, PREG_SET_ORDER);
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
if ($found) {
    foreach ($outs as $out) {
        // The link text comes from a remote page: escape it so it cannot close
        // the <textarea> and inject markup. double_encode is off so entities
        // already present in the scraped HTML (e.g. &amp;) display as before.
        echo htmlspecialchars($out[1], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . "\n";
    }
}
echo "</textarea>";
echo "<p>Finnished</p>";
?> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Remove the execution time limit for this script: 0 means "no limit".
// The limit is lifted both through the dedicated function and through the
// ini setting.
set_time_limit(0);
ini_set('max_execution_time', 0);
/**
 * Download a page with cURL and rewrite its relative links to absolute URLs.
 *
 * The response headers are part of the returned text, because CURLOPT_HEADER
 * is switched on below.
 *
 * @param string       $url               URL to fetch.
 * @param string|false $post_paramtrs     URL-encoded POST fields. When truthy the request is sent
 *                                        as POST with the body "var1=bla&" followed by this value;
 *                                        otherwise a GET is sent (put GET parameters in $url).
 * @param bool         $return_full_array When true, return array('data' => ..., 'info' => ...),
 *                                        where 'info' is the curl_getinfo() result.
 * @param int          $max_redirects     Number of redirect hops this function may still follow
 *                                        by itself. Only relevant when cURL is not allowed to
 *                                        follow redirects (open_basedir / safe_mode set).
 *
 * @return string|array The response text with src/href/action values made absolute, or the array
 *                      described above. For a final status other than 200 (and not a redirect)
 *                      the text is an "ERRORCODE22 ..." diagnostic message.
 */
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false, $max_redirects=10) {
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    if ($post_paramtrs) {
        curl_setopt($c, CURLOPT_POST, TRUE);
        curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs);
    }
    // NOTE(review): certificate and host-name verification are switched off
    // (kept from the original); fetched content can be tampered with in transit.
    curl_setopt($c, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($c, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
    curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
    // Browser-like request headers. NOTE(review): User-Agent and Cookie repeat
    // the two options set just above; kept as in the original.
    $header = array(
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Cookie:CookieName1=Value;",
        "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ",
    );
    // Include the response headers in the returned text.
    curl_setopt($c, CURLOPT_HEADER, true);
    curl_setopt($c, CURLOPT_HTTPHEADER, $header);
    curl_setopt($c, CURLOPT_MAXREDIRS, 10);
    // With SAFE_MODE or OPEN_BASEDIR set, FOLLOWLOCATION cannot be used, so
    // redirects are then followed manually further down.
    $follow_allowed = !(ini_get('open_basedir') || ini_get('safe_mode'));
    if ($follow_allowed) {
        curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
    curl_setopt($c, CURLOPT_REFERER, $url);
    curl_setopt($c, CURLOPT_TIMEOUT, 60);
    curl_setopt($c, CURLOPT_AUTOREFERER, true);
    curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');

    $data = curl_exec($c);
    $status = curl_getinfo($c);
    curl_close($c);

    // A failed transfer yields false; treat it as an empty document so the
    // string handling below always works on a string.
    if ($data === false) {
        $data = '';
    }

    // Split the effective URL into scheme, host and directory. Links are only
    // rewritten when this succeeds; otherwise they would be prefixed with
    // garbage such as "://".
    $effective_url = isset($status['url']) ? $status['url'] : '';
    if ($data !== '' && preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $effective_url, $link)) {
        $directory_url = $link[0];
        $domain_url = $link[1].'://'.$link[3];

        // Callbacks are used instead of replacement strings so that "$" or
        // "\" characters inside the URL are never read as group references.

        // Links not starting with http, https, javascript:, // or / get the
        // current directory prepended:
        // href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/./image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si',
            function ($m) use ($directory_url) {
                return $m[1].'='.$m[2].$directory_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }

        // What is still not absolute now starts with a single "/" and gets the
        // domain prepended: href="/image.JPG" becomes href="http://example.com/image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si',
            function ($m) use ($domain_url) {
                return $m[1].'='.$m[2].$domain_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }
    }

    $http_code = isset($status['http_code']) ? (int) $status['http_code'] : 0;

    if (in_array($http_code, array(301, 302, 303, 307, 308), true)) {
        // When cURL was allowed to follow redirects it already did so, and
        // there is nothing left to do here.
        if (!$follow_allowed) {
            $redirURL = '';
            if (!empty($status['redirect_url'])) {
                // Redirect target reported by cURL.
                $redirURL = $status['redirect_url'];
            } elseif (preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a header line inside the response text.
                $redirURL = $m[2];
            } elseif (preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a 'moved <a href="...">here</a>' body;
                // group 2 is the href value (group 1 is only the tag attributes).
                $redirURL = $m[2];
            }
            $redirURL = trim($redirURL);
            if ($redirURL !== '' && $max_redirects > 0) {
                // A 303 response asks for a GET of the new location.
                $next_post = ($http_code === 303) ? false : $post_paramtrs;
                return get_remote_data($redirURL, $next_post, $return_full_array, $max_redirects - 1);
            }
        }
    } elseif ($http_code != 200) {
        // Neither a redirect nor a "200" page: report the failure in the data.
        $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";
    }

    return ($return_full_array ? array('data'=>$data, 'info'=>$status) : $data);
}
// Fetch the o-xe.com front page and print one line per matched table row,
// inside a <textarea>. The six captured fields are, in order: the icon file
// name (without ".png"), the cell's title attribute, its id attribute, the
// bold label, the "MonetaryBuy" value and the "MonetarySell" value.
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
$data = get_remote_data("http://www.o-xe.com/");
$outs = array();
$found = preg_match_all("|<td\s+style='background-image\s*:\s+url\(\/images\/icons\/([^\.]+).png\);background-repeat\s*:\s+no-repeat;background-position\s*:\s+left\s+center;cursor:pointer'\s+\s+class='MonetaryText'\s+title='([^']*)'\s+id='([^']*)'><b>([^<]*)<\/b><\/td><td\s+align='center'\s+class='MonetaryBuy'>([0-9,]*)<\/td><td\s+align='center'\s+class='MonetarySell'>([0-9,]*)<\/td>|isU",
    (string) $data,
    $outs,
    PREG_SET_ORDER);
if ($found) {
    foreach ($outs as $out) {
        $line = '';
        for ($field = 1; $field <= 6; $field++) {
            // The text comes from a remote page: escape it so it cannot close
            // the <textarea> and inject markup. double_encode is off so
            // entities already present in the scraped HTML display as before.
            $line .= htmlspecialchars($out[$field], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . " ";
        }
        echo $line . "\n";
    }
}
echo "</textarea>";
echo "<p>Finnished</p>";
?> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Remove the execution time limit for this script: 0 means "no limit".
// The limit is lifted both through the dedicated function and through the
// ini setting.
set_time_limit(0);
ini_set('max_execution_time', 0);
/**
 * Download a page with cURL and rewrite its relative links to absolute URLs.
 *
 * The response headers are part of the returned text, because CURLOPT_HEADER
 * is switched on below.
 *
 * @param string       $url               URL to fetch.
 * @param string|false $post_paramtrs     URL-encoded POST fields. When truthy the request is sent
 *                                        as POST with the body "var1=bla&" followed by this value;
 *                                        otherwise a GET is sent (put GET parameters in $url).
 * @param bool         $return_full_array When true, return array('data' => ..., 'info' => ...),
 *                                        where 'info' is the curl_getinfo() result.
 * @param int          $max_redirects     Number of redirect hops this function may still follow
 *                                        by itself. Only relevant when cURL is not allowed to
 *                                        follow redirects (open_basedir / safe_mode set).
 *
 * @return string|array The response text with src/href/action values made absolute, or the array
 *                      described above. For a final status other than 200 (and not a redirect)
 *                      the text is an "ERRORCODE22 ..." diagnostic message.
 */
function get_remote_data($url, $post_paramtrs=false, $return_full_array=false, $max_redirects=10) {
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    if ($post_paramtrs) {
        curl_setopt($c, CURLOPT_POST, TRUE);
        curl_setopt($c, CURLOPT_POSTFIELDS, "var1=bla&".$post_paramtrs);
    }
    // NOTE(review): certificate and host-name verification are switched off
    // (kept from the original); fetched content can be tampered with in transit.
    curl_setopt($c, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($c, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0");
    curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
    // Browser-like request headers. NOTE(review): User-Agent and Cookie repeat
    // the two options set just above; kept as in the original.
    $header = array(
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Cookie:CookieName1=Value;",
        "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ",
    );
    // Include the response headers in the returned text.
    curl_setopt($c, CURLOPT_HEADER, true);
    curl_setopt($c, CURLOPT_HTTPHEADER, $header);
    curl_setopt($c, CURLOPT_MAXREDIRS, 10);
    // With SAFE_MODE or OPEN_BASEDIR set, FOLLOWLOCATION cannot be used, so
    // redirects are then followed manually further down.
    $follow_allowed = !(ini_get('open_basedir') || ini_get('safe_mode'));
    if ($follow_allowed) {
        curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
    curl_setopt($c, CURLOPT_REFERER, $url);
    curl_setopt($c, CURLOPT_TIMEOUT, 60);
    curl_setopt($c, CURLOPT_AUTOREFERER, true);
    curl_setopt($c, CURLOPT_ENCODING, 'gzip,deflate');

    $data = curl_exec($c);
    $status = curl_getinfo($c);
    curl_close($c);

    // A failed transfer yields false; treat it as an empty document so the
    // string handling below always works on a string.
    if ($data === false) {
        $data = '';
    }

    // Split the effective URL into scheme, host and directory. Links are only
    // rewritten when this succeeds; otherwise they would be prefixed with
    // garbage such as "://".
    $effective_url = isset($status['url']) ? $status['url'] : '';
    if ($data !== '' && preg_match('/(http(|s)):\/\/(.*?)\/(.*\/|)/si', $effective_url, $link)) {
        $directory_url = $link[0];
        $domain_url = $link[1].'://'.$link[3];

        // Callbacks are used instead of replacement strings so that "$" or
        // "\" characters inside the URL are never read as group references.

        // Links not starting with http, https, javascript:, // or / get the
        // current directory prepended:
        // href="./image.JPG" becomes href="http://example.com/DIR/SUBDIR/./image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/|\/)).*?)(\'|\")/si',
            function ($m) use ($directory_url) {
                return $m[1].'='.$m[2].$directory_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }

        // What is still not absolute now starts with a single "/" and gets the
        // domain prepended: href="/image.JPG" becomes href="http://example.com/image.JPG".
        $rewritten = preg_replace_callback(
            '/(src|href|action)=(\'|\")((?!(http|https|javascript:|\/\/)).*?)(\'|\")/si',
            function ($m) use ($domain_url) {
                return $m[1].'='.$m[2].$domain_url.$m[3].$m[5];
            },
            $data
        );
        if ($rewritten !== null) {
            $data = $rewritten;
        }
    }

    $http_code = isset($status['http_code']) ? (int) $status['http_code'] : 0;

    if (in_array($http_code, array(301, 302, 303, 307, 308), true)) {
        // When cURL was allowed to follow redirects it already did so, and
        // there is nothing left to do here.
        if (!$follow_allowed) {
            $redirURL = '';
            if (!empty($status['redirect_url'])) {
                // Redirect target reported by cURL.
                $redirURL = $status['redirect_url'];
            } elseif (preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a header line inside the response text.
                $redirURL = $m[2];
            } elseif (preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si', $data, $m) && trim($m[2]) !== '') {
                // Redirect target found in a 'moved <a href="...">here</a>' body;
                // group 2 is the href value (group 1 is only the tag attributes).
                $redirURL = $m[2];
            }
            $redirURL = trim($redirURL);
            if ($redirURL !== '' && $max_redirects > 0) {
                // A 303 response asks for a GET of the new location.
                $next_post = ($http_code === 303) ? false : $post_paramtrs;
                return get_remote_data($redirURL, $next_post, $return_full_array, $max_redirects - 1);
            }
        }
    } elseif ($http_code != 200) {
        // Neither a redirect nor a "200" page: report the failure in the data.
        $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";
    }

    return ($return_full_array ? array('data'=>$data, 'info'=>$status) : $data);
}
// Walk result pages 1 to 50 of the dotproperty.co.th "properties for rent"
// listing and print the URL of every listing link found, one per line,
// inside a <textarea>.
echo "<textarea"." style=".'"margin: 0px; width: 776px; height: 580px;"'.">";
$listing_url = "https://www.dotproperty.co.th/en/properties-for-rent";
for ($i = 1; $i <= 50; $i++) {
    // The first page is requested without a "page" parameter.
    $page_url = ($i == 1) ? $listing_url : $listing_url . "?page=$i";
    $data = get_remote_data($page_url);

    $outs = array();
    $found = preg_match_all('|<div\s*class="wrapper">\s*<div\s*class="left-block">\s*<a\s*target="?_blank"?\s*data-tracking="[^"]*"\s*href="(https:\/\/www\.dotproperty\.co\.th\/en\/[^"]*)"\s*title="([^"]*)"\s*onmousedown="[^"]*">|isU',
        (string) $data,
        $outs, PREG_SET_ORDER);
    if (!$found) {
        // Nothing matched on this page (or the match failed): go on to the next one.
        continue;
    }

    foreach ($outs as $out) {
        // The link text comes from a remote page: escape it so it cannot close
        // the <textarea> and inject markup. double_encode is off so entities
        // already present in the scraped HTML (e.g. &amp;) display as before.
        echo htmlspecialchars($out[1], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . "\n";
    }
}
echo "</textarea>";
echo "<p>Finnished</p>";
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment