Created
April 15, 2014 06:31
-
-
Save mathewka/10707321 to your computer and use it in GitHub Desktop.
Data Scraping - Read data from another website.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
/**
 * Scrapes a listing page for links and then reads product details
 * (images, name, description) from every linked page.
 *
 * All of the work is started from the constructor, so simply instantiating
 * the class opens a MySQL connection and performs HTTP requests.
 */
class WebContentReader{
    /** @var array URLs that have already been followed during this run. */
    public $global_url;

    /**
     * Connects to MySQL, downloads the listing page and visits every link on it.
     *
     * @param string $url        Listing page to start from. The default is the
     *                           placeholder shipped with this file and must be replaced.
     * @param string $link_xpath XPath selecting the anchor elements to follow.
     *                           The default is likewise a placeholder.
     */
    function __construct($url = "site url.)", $link_xpath = "element path"){
        $this->global_url = array();

        // Create connection. Depending on the mysqli error mode a failed
        // connection either throws or returns false; both are normalised to
        // false here so there is a single failure path.
        try {
            $con = mysqli_connect("localhost","root","","dbname");
        } catch (mysqli_sql_exception $e) {
            $con = false;
        }
        if (!$con) {
            // mysqli_connect_errno()/mysqli_connect_error() take no arguments.
            echo "Failed to connect to MySQL: " . mysqli_connect_error();
            return;
        }

        $this->crawlListing($url, $link_xpath, $con);
        $con->close();
    }

    /**
     * Fetches the listing page and calls readDataFromUrl() once per distinct link.
     *
     * @param string $url        Listing page URL.
     * @param string $link_xpath XPath selecting the anchors to follow.
     * @param mysqli $con        Open connection, handed on to readDataFromUrl().
     */
    private function crawlListing($url, $link_xpath, $con){
        $page_contents = $this->getWebPage( $url );
        print "page URL-----".$url."<br><br>";
        if (!$this->reportFetchErrors($page_contents)) {
            return; // nothing usable to parse
        }

        $xpath = $this->createXPath($page_contents['content']);
        if ($xpath === null) {
            return;
        }

        // query() returns false (not null) when the expression is invalid.
        $elements = $xpath->query($link_xpath);
        if ($elements === false) {
            return;
        }

        foreach ($elements as $element) {
            if (!($element instanceof DOMElement)) {
                continue; // the expression matched something that has no attributes
            }
            $product_url = $element->getAttribute('href');
            if ($product_url === '' || $product_url === '#') {
                continue;
            }
            if (in_array($product_url, $this->global_url, true)) {
                continue; // already visited
            }
            // Record the URL that was actually followed so duplicates are skipped.
            $this->global_url[] = $product_url;
            $this->readDataFromUrl($product_url, $con);
        }
    }

    /**
     * Prints the standard error messages for a getWebPage() result.
     *
     * @param array $page_contents Result of getWebPage().
     * @return bool True when the transfer succeeded with HTTP status 200.
     */
    private function reportFetchErrors($page_contents){
        $ok = true;
        if ( $page_contents['errno'] != 0 ){
            print "error: bad url, timeout, redirect loop ...";
            $ok = false;
        }
        if ( $page_contents['http_code'] != 200 ){
            print "error: no page, no permissions, no service ...";
            $ok = false;
        }
        return $ok;
    }

    /**
     * Parses an HTML string and returns an XPath evaluator for it.
     *
     * @param mixed $html Markup to parse.
     * @return DOMXPath|null Null when $html is not a non-empty string or cannot be parsed.
     */
    private function createXPath($html){
        if (!is_string($html) || $html === '') {
            return null;
        }
        // Collect markup warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument;
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $loaded ? new DOMXPath($dom) : null;
    }

    /**
     * Returns the last node matched by an XPath query.
     *
     * @param DOMXPath $xpath Evaluator to query.
     * @param string   $query XPath expression.
     * @return DOMNode|null Null when nothing matches or the expression is invalid.
     */
    private function lastMatch($xpath, $query){
        $nodes = $xpath->query($query);
        if ($nodes === false || $nodes->length === 0) {
            return null;
        }
        return $nodes->item($nodes->length - 1);
    }

    /**
     * Replaces spaces with hyphens and strips every character that is not an
     * ASCII letter, digit or hyphen.
     *
     * @param string $string Text to clean.
     * @return string Cleaned text.
     */
    function clean($string) {
        $string = str_replace(' ', '-', $string); // Replaces all spaces with hyphens.
        return preg_replace('/[^A-Za-z0-9\-]/', '', $string); // Removes special chars.
    }

    /**
     * Downloads a URL with cURL.
     *
     * @param string $url Address to fetch.
     * @return array The curl_getinfo() array extended with 'errno', 'errmsg'
     *               and 'content' (the response body, or false on failure).
     */
    function getWebPage( $url )
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,     // return web page
            CURLOPT_HEADER         => false,    // don't return headers
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            CURLOPT_ENCODING       => "",       // handle all encodings
            CURLOPT_USERAGENT      => "spider", // who am i
            CURLOPT_AUTOREFERER    => true,     // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 240,      // timeout on connect
            CURLOPT_TIMEOUT        => 240,      // timeout on response
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
        );

        $ch = curl_init( $url );
        if ($ch === false) {
            // No handle could be created; report it in the same shape as a failed transfer.
            return array(
                'http_code' => 0,
                'errno'     => -1,
                'errmsg'    => 'curl_init failed',
                'content'   => false,
            );
        }
        curl_setopt_array( $ch, $options );
        $content = curl_exec( $ch );
        $err     = curl_errno( $ch );
        $errmsg  = curl_error( $ch );
        $header  = curl_getinfo( $ch );
        curl_close( $ch );

        $header['errno']   = $err;
        $header['errmsg']  = $errmsg;
        $header['content'] = $content;
        return $header;
    }

    /**
     * Reads the product details from a single page.
     *
     * When several elements match a selector the last one wins.
     *
     * @param string $url Page to read.
     * @param mysqli $con Database connection; not used by the code in this method.
     * @return array Scraped fields ('url', 'image_500', 'image_200',
     *               'product_name', 'product_description', 'specifications'),
     *               or an empty array when the page could not be fetched or parsed.
     */
    function readDataFromUrl( $url, $con){
        $page_contents = $this->getWebPage( $url );
        if (!$this->reportFetchErrors($page_contents)) {
            return array();
        }

        $xpath = $this->createXPath($page_contents['content']);
        if ($xpath === null) {
            return array();
        }

        // Enlarged image: href of the anchor with class "picture-link".
        $node = $this->lastMatch($xpath, "//a[@class='picture-link']");
        $image_500 = ($node instanceof DOMElement) ? $node->getAttribute('href') : "";

        // Regular image: src of the img inside that anchor.
        $node = $this->lastMatch($xpath, "//a[@class='picture-link']/img");
        $image_200 = ($node instanceof DOMElement) ? $node->getAttribute('src') : "";

        // Product name.
        $node = $this->lastMatch($xpath, '//table[@id="product-attribute-specs-table"]//h3[@itemprop="name"]');
        $product_name = ($node !== null) ? $node->nodeValue : "";

        // Product description.
        $node = $this->lastMatch($xpath, '//h2[@itemprop="description"]');
        $product_description = ($node !== null) ? $node->nodeValue : "";

        // The code that fills the specification labels/values is not present in
        // this file, so both lists start empty and the pairing below is a no-op
        // until that extraction is added.
        $specification_label_array = array();
        $specification_data_array = array();
        $specification_array = array();
        $pairs = min(count($specification_label_array), count($specification_data_array));
        for ($i = 0; $i < $pairs; $i++) {
            $specification_array[$specification_label_array[$i]] = $specification_data_array[$i];
        }

        return array(
            'url'                 => $url,
            'image_500'           => $image_500,
            'image_200'           => $image_200,
            'product_name'        => $product_name,
            'product_description' => $product_description,
            'specifications'      => $specification_array,
        );
    }
}
// Entry point: constructing the reader runs its constructor, which connects to MySQL and fetches the start page.
| $obj = new WebContentReader(); | |
| ?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment