mathewka · April 15, 2014 06:31
diff --git a/website-content-reader.php b/website-content-reader.php
 <?php
 /**
  * Crawls a listing page and reads product details from every page it links to.
  *
  * All of the work is started by the constructor: it opens a MySQL connection,
  * downloads the listing page and calls readDataFromUrl() for each link found.
  * Progress and error messages are printed straight to the output.
  */
 class WebContentReader{
     /** @var array href values already encountered on the listing page. */
     public $global_url;

     /**
      * Connect to MySQL, download the listing page and process each link on it.
      */
     function __construct(){
         $this->global_url = array();
         // Create connection
         $con = mysqli_connect("localhost","root","","dbname");
         // mysqli_connect_errno() takes no argument, and a failed connect yields false.
         if (!$con || mysqli_connect_errno())
         {
             echo "Failed to connect to MySQL: " . mysqli_connect_error();
             // Stop here: calling $con->close() on a failed connection is a fatal error.
             return;
         }
         // NOTE(review): this URL and the XPath below look like placeholders - set real values before running.
         $url = "site url.)";
         print "page URL-----".$url."<br><br>";
         $xpath = $this->loadXPath( $url );
         if ($xpath !== null) {
             // Example of a real query:
             // "//ul[@class='cat-list']/li/ul[@class='subcat-list']/ul[@class='subcat-list']/li/a"
             $elements = $xpath->query("element path");
             // DOMXPath::query() signals an invalid expression with false, never null.
             if ($elements !== false) {
                 foreach ($elements as $element) {
                     $product_url = $element->getAttribute('href');
                     if ($product_url !== '#' && !in_array($product_url, $this->global_url, true)) {
                         $this->readDataFromUrl($product_url, $con);
                     }
                     // Remember the link so that a repeated href is not fetched again.
                     $this->global_url[] = $product_url;
                 }
             }
         }
         $con->close();
     }

     /**
      * Reduce a string to letters, digits and hyphens.
      *
      * Spaces are replaced with hyphens first; every remaining character that
      * is not A-Z, a-z, 0-9 or "-" is then removed.
      *
      * @param string $string Text to clean.
      * @return string The cleaned text.
      */
     function clean($string) {
         $string = str_replace(' ', '-', $string); // Replaces all spaces with hyphens.
         return preg_replace('/[^A-Za-z0-9\-]/', '', $string); // Removes special chars.
     }

     /**
      * Download a URL with cURL.
      *
      * @param string $url Address to fetch.
      * @return array The curl_getinfo() array for the transfer, extended with
      *               'errno' (curl_errno), 'errmsg' (curl_error) and 'content'
      *               (the return value of curl_exec).
      */
     function getWebPage( $url )
     {
         $options = array(
             CURLOPT_RETURNTRANSFER => true,     // return web page
             CURLOPT_HEADER         => false,    // don't return headers
             CURLOPT_FOLLOWLOCATION => true,     // follow redirects
             CURLOPT_ENCODING       => "",       // handle all encodings
             CURLOPT_USERAGENT      => "spider", // who am i
             CURLOPT_AUTOREFERER    => true,     // set referer on redirect
             CURLOPT_CONNECTTIMEOUT => 240,      // timeout on connect
             CURLOPT_TIMEOUT        => 240,      // timeout on response
             CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
         );

         $ch      = curl_init( $url );
         curl_setopt_array( $ch, $options );
         $content = curl_exec( $ch );
         $err     = curl_errno( $ch );
         $errmsg  = curl_error( $ch );
         $header  = curl_getinfo( $ch );
         curl_close( $ch );

         $header['errno']   = $err;
         $header['errmsg']  = $errmsg;
         $header['content'] = $content;
         return $header;
     }

     /**
      * Download a page and prepare it for XPath queries.
      *
      * Prints an error message for a cURL failure and for a non-200 status.
      *
      * @param string $url Address to fetch.
      * @return DOMXPath|null Null when the download failed or returned no body.
      */
     private function loadXPath( $url )
     {
         $page_contents = $this->getWebPage( $url );
         $failed = false;
         if ( $page_contents['errno'] != 0 ){
             print "error: bad url, timeout, redirect loop ...";
             $failed = true;
         }
         if ( $page_contents['http_code'] != 200 ){
             print "error: no page, no permissions, no service ...";
             $failed = true;
         }
         $contents = $page_contents['content'];
         // curl_exec() returns false on failure; there is nothing to parse then.
         if ($failed || !is_string($contents) || $contents === '') {
             return null;
         }
         $dom = new DOMDocument;
         // Collect markup warnings internally instead of hiding them with "@".
         $previous = libxml_use_internal_errors(true);
         $dom->loadHTML($contents);
         libxml_clear_errors();
         libxml_use_internal_errors($previous);
         return new DOMXPath($dom);
     }

     /**
      * Return the given attribute of the last node matched by an XPath query.
      *
      * @return string Empty string when nothing matches or the query is invalid.
      */
     private function lastAttribute($xpath, $query, $attribute)
     {
         $value = "";
         $nodes = $xpath->query($query);
         if ($nodes !== false) {
             foreach ($nodes as $node) {
                 $value = $node->getAttribute($attribute);
             }
         }
         return $value;
     }

     /**
      * Return the text content (nodeValue) of the last node matched by an XPath query.
      *
      * @return string Empty string when nothing matches or the query is invalid.
      */
     private function lastText($xpath, $query)
     {
         $value = "";
         $nodes = $xpath->query($query);
         if ($nodes !== false) {
             foreach ($nodes as $node) {
                 $value = $node->nodeValue;
             }
         }
         return $value;
     }

     /**
      * Read the product details from one product page.
      *
      * @param string $url Address of the product page.
      * @param mixed  $con Database connection handed over by the constructor;
      *                    this method does not use it.
      * @return array|null Null when the page could not be loaded, otherwise an
      *                    array with the keys 'image_500', 'image_200',
      *                    'product_name', 'product_description' and
      *                    'specifications'.
      */
     function readDataFromUrl( $url, $con){
         $xpath = $this->loadXPath( $url );
         if ($xpath === null) {
             return null;
         }
         // Enlarged image: href of the anchor with class "picture-link".
         $image_500 = $this->lastAttribute($xpath, "//a[@class='picture-link']", 'href');
         // Smaller image: src of the img inside that anchor.
         $image_200 = $this->lastAttribute($xpath, "//a[@class='picture-link']/img", 'src');
         $product_name = $this->lastText($xpath, '//table[@id="product-attribute-specs-table"]//h3[@itemprop="name"]');
         $product_description = $this->lastText($xpath, '//h2[@itemprop="description"]');

         // NOTE(review): nothing in the visible source fills these two lists, so they
         // start empty here; add the extraction of specification labels/values.
         $specification_label_array = array();
         $specification_data_array = array();
         $specification_array = array();
         // Pair only as many entries as both lists actually contain.
         $pairs = min(count($specification_label_array), count($specification_data_array));
         for ($i = 0; $i < $pairs; $i++) {
             $specification_array[$specification_label_array[$i]] = $specification_data_array[$i];
         }

         return array(
             'image_500'           => $image_500,
             'image_200'           => $image_200,
             'product_name'        => $product_name,
             'product_description' => $product_description,
             'specifications'      => $specification_array,
         );
     }

 }
 // Creating the object starts the whole run: the constructor connects to MySQL,
 // downloads the listing page and processes every link on it.
 $obj = new WebContentReader();
 ?>
	<?php
	/**
	 * Crawls a listing page and reads product details from every page it links to.
	 *
	 * All of the work is started by the constructor: it opens a MySQL connection,
	 * downloads the listing page and calls readDataFromUrl() for each link found.
	 * Progress and error messages are printed straight to the output.
	 */
	class WebContentReader{
	    /** @var array href values already encountered on the listing page. */
	    public $global_url;

	    /**
	     * Connect to MySQL, download the listing page and process each link on it.
	     */
	    function __construct(){
	        $this->global_url = array();
	        // Create connection
	        $con = mysqli_connect("localhost","root","","dbname");
	        // mysqli_connect_errno() takes no argument, and a failed connect yields false.
	        if (!$con || mysqli_connect_errno())
	        {
	            echo "Failed to connect to MySQL: " . mysqli_connect_error();
	            // Stop here: calling $con->close() on a failed connection is a fatal error.
	            return;
	        }
	        // NOTE(review): this URL and the XPath below look like placeholders - set real values before running.
	        $url = "site url.)";
	        print "page URL-----".$url."<br><br>";
	        $xpath = $this->loadXPath( $url );
	        if ($xpath !== null) {
	            // Example of a real query:
	            // "//ul[@class='cat-list']/li/ul[@class='subcat-list']/ul[@class='subcat-list']/li/a"
	            $elements = $xpath->query("element path");
	            // DOMXPath::query() signals an invalid expression with false, never null.
	            if ($elements !== false) {
	                foreach ($elements as $element) {
	                    $product_url = $element->getAttribute('href');
	                    if ($product_url !== '#' && !in_array($product_url, $this->global_url, true)) {
	                        $this->readDataFromUrl($product_url, $con);
	                    }
	                    // Remember the link so that a repeated href is not fetched again.
	                    $this->global_url[] = $product_url;
	                }
	            }
	        }
	        $con->close();
	    }

	    /**
	     * Reduce a string to letters, digits and hyphens.
	     *
	     * Spaces are replaced with hyphens first; every remaining character that
	     * is not A-Z, a-z, 0-9 or "-" is then removed.
	     *
	     * @param string $string Text to clean.
	     * @return string The cleaned text.
	     */
	    function clean($string) {
	        $string = str_replace(' ', '-', $string); // Replaces all spaces with hyphens.
	        return preg_replace('/[^A-Za-z0-9\-]/', '', $string); // Removes special chars.
	    }

	    /**
	     * Download a URL with cURL.
	     *
	     * @param string $url Address to fetch.
	     * @return array The curl_getinfo() array for the transfer, extended with
	     *               'errno' (curl_errno), 'errmsg' (curl_error) and 'content'
	     *               (the return value of curl_exec).
	     */
	    function getWebPage( $url )
	    {
	        $options = array(
	            CURLOPT_RETURNTRANSFER => true, // return web page
	            CURLOPT_HEADER => false, // don't return headers
	            CURLOPT_FOLLOWLOCATION => true, // follow redirects
	            CURLOPT_ENCODING => "", // handle all encodings
	            CURLOPT_USERAGENT => "spider", // who am i
	            CURLOPT_AUTOREFERER => true, // set referer on redirect
	            CURLOPT_CONNECTTIMEOUT => 240, // timeout on connect
	            CURLOPT_TIMEOUT => 240, // timeout on response
	            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
	        );

	        $ch = curl_init( $url );
	        curl_setopt_array( $ch, $options );
	        $content = curl_exec( $ch );
	        $err = curl_errno( $ch );
	        $errmsg = curl_error( $ch );
	        $header = curl_getinfo( $ch );
	        curl_close( $ch );

	        $header['errno'] = $err;
	        $header['errmsg'] = $errmsg;
	        $header['content'] = $content;
	        return $header;
	    }

	    /**
	     * Download a page and prepare it for XPath queries.
	     *
	     * Prints an error message for a cURL failure and for a non-200 status.
	     *
	     * @param string $url Address to fetch.
	     * @return DOMXPath|null Null when the download failed or returned no body.
	     */
	    private function loadXPath( $url )
	    {
	        $page_contents = $this->getWebPage( $url );
	        $failed = false;
	        if ( $page_contents['errno'] != 0 ){
	            print "error: bad url, timeout, redirect loop ...";
	            $failed = true;
	        }
	        if ( $page_contents['http_code'] != 200 ){
	            print "error: no page, no permissions, no service ...";
	            $failed = true;
	        }
	        $contents = $page_contents['content'];
	        // curl_exec() returns false on failure; there is nothing to parse then.
	        if ($failed || !is_string($contents) || $contents === '') {
	            return null;
	        }
	        $dom = new DOMDocument;
	        // Collect markup warnings internally instead of hiding them with "@".
	        $previous = libxml_use_internal_errors(true);
	        $dom->loadHTML($contents);
	        libxml_clear_errors();
	        libxml_use_internal_errors($previous);
	        return new DOMXPath($dom);
	    }

	    /**
	     * Return the given attribute of the last node matched by an XPath query.
	     *
	     * @return string Empty string when nothing matches or the query is invalid.
	     */
	    private function lastAttribute($xpath, $query, $attribute)
	    {
	        $value = "";
	        $nodes = $xpath->query($query);
	        if ($nodes !== false) {
	            foreach ($nodes as $node) {
	                $value = $node->getAttribute($attribute);
	            }
	        }
	        return $value;
	    }

	    /**
	     * Return the text content (nodeValue) of the last node matched by an XPath query.
	     *
	     * @return string Empty string when nothing matches or the query is invalid.
	     */
	    private function lastText($xpath, $query)
	    {
	        $value = "";
	        $nodes = $xpath->query($query);
	        if ($nodes !== false) {
	            foreach ($nodes as $node) {
	                $value = $node->nodeValue;
	            }
	        }
	        return $value;
	    }

	    /**
	     * Read the product details from one product page.
	     *
	     * @param string $url Address of the product page.
	     * @param mixed  $con Database connection handed over by the constructor;
	     *                    this method does not use it.
	     * @return array|null Null when the page could not be loaded, otherwise an
	     *                    array with the keys 'image_500', 'image_200',
	     *                    'product_name', 'product_description' and
	     *                    'specifications'.
	     */
	    function readDataFromUrl( $url, $con){
	        $xpath = $this->loadXPath( $url );
	        if ($xpath === null) {
	            return null;
	        }
	        // Enlarged image: href of the anchor with class "picture-link".
	        $image_500 = $this->lastAttribute($xpath, "//a[@class='picture-link']", 'href');
	        // Smaller image: src of the img inside that anchor.
	        $image_200 = $this->lastAttribute($xpath, "//a[@class='picture-link']/img", 'src');
	        $product_name = $this->lastText($xpath, '//table[@id="product-attribute-specs-table"]//h3[@itemprop="name"]');
	        $product_description = $this->lastText($xpath, '//h2[@itemprop="description"]');

	        // NOTE(review): nothing in the visible source fills these two lists, so they
	        // start empty here; add the extraction of specification labels/values.
	        $specification_label_array = array();
	        $specification_data_array = array();
	        $specification_array = array();
	        // Pair only as many entries as both lists actually contain.
	        $pairs = min(count($specification_label_array), count($specification_data_array));
	        for ($i = 0; $i < $pairs; $i++) {
	            $specification_array[$specification_label_array[$i]] = $specification_data_array[$i];
	        }

	        return array(
	            'image_500'           => $image_500,
	            'image_200'           => $image_200,
	            'product_name'        => $product_name,
	            'product_description' => $product_description,
	            'specifications'      => $specification_array,
	        );
	    }

	}
	// Creating the object starts the whole run: the constructor connects to MySQL,
	// downloads the listing page and processes every link on it.
	$obj = new WebContentReader();
	?>
No results found