Skip to content

Instantly share code, notes, and snippets.

@mathewka
Created April 15, 2014 06:31
Show Gist options
  • Select an option

  • Save mathewka/10707321 to your computer and use it in GitHub Desktop.

Select an option

Save mathewka/10707321 to your computer and use it in GitHub Desktop.
Data Scraping - Read data from another website.
<?php
/**
 * Small scraper: fetches an index page, follows the links selected by an
 * XPath expression, and extracts product details from each linked page.
 *
 * The whole run is started from the constructor. The site URL and the link
 * XPath expression are placeholders that must be edited before use.
 */
class WebContentReader
{
    /**
     * URLs that have already been handed to readDataFromUrl() during this
     * run; used to avoid fetching the same link twice.
     *
     * @var array
     */
    public $global_url;

    /**
     * Connects to MySQL, crawls the index page and closes the connection.
     *
     * If the database connection cannot be established the failure is
     * reported and nothing is crawled.
     */
    function __construct()
    {
        $this->global_url = array();

        // Create connection. Depending on the PHP version / mysqli report
        // mode a failure is signalled either by an exception or by a false
        // return value, so both are handled.
        try {
            $con = mysqli_connect("localhost", "root", "", "dbname");
        } catch (mysqli_sql_exception $e) {
            echo "Failed to connect to MySQL: " . $e->getMessage();
            return;
        }
        // mysqli_connect_errno() takes no arguments.
        if (!$con || mysqli_connect_errno()) {
            echo "Failed to connect to MySQL: " . mysqli_connect_error();
            return;
        }

        $this->crawlIndexPage("site url.)", $con);

        $con->close();
    }

    /**
     * Fetches the index page and processes every distinct link it contains.
     *
     * @param string $url Address of the index page.
     * @param mysqli $con Open database connection, passed on to readDataFromUrl().
     * @return void
     */
    private function crawlIndexPage($url, $con)
    {
        $page_contents = $this->getWebPage($url);
        print "page URL-----" . $url . "<br><br>";
        if (!$this->fetchSucceeded($page_contents)) {
            return;
        }

        $xpath = $this->createXPath($page_contents['content']);
        if ($xpath === null) {
            return;
        }

        // Placeholder expression; an example of a real one:
        // "//ul[@class='cat-list']/li/ul[@class='subcat-list']/ul[@class='subcat-list']/li/a"
        $elements = $xpath->query("element path");
        if ($elements === false) {
            // query() returns false (not null) for an invalid expression.
            return;
        }

        foreach ($elements as $element) {
            if (!($element instanceof DOMElement)) {
                continue;
            }
            $product_url = $element->getAttribute('href');
            if ($product_url === '' || $product_url === '#') {
                continue;
            }
            if (in_array($product_url, $this->global_url, true)) {
                continue;
            }
            // Remember the URL itself so that repeated links are skipped.
            $this->global_url[] = $product_url;
            $this->readDataFromUrl($product_url, $con);
        }
    }

    /**
     * Turns a string into a slug: spaces become hyphens and every character
     * other than ASCII letters, digits and hyphens is removed.
     *
     * @param string $string Text to clean.
     * @return string Cleaned text.
     */
    function clean($string)
    {
        $string = str_replace(' ', '-', $string); // Replaces all spaces with hyphens.
        return preg_replace('/[^A-Za-z0-9\-]/', '', $string); // Removes special chars.
    }

    /**
     * Downloads a URL with cURL.
     *
     * @param string $url Address to fetch.
     * @return array The curl_getinfo() data for the transfer, plus the keys
     *               'errno' (cURL error number, 0 on success), 'errmsg'
     *               (cURL error text) and 'content' (response body, or
     *               false when the transfer failed).
     */
    function getWebPage($url)
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,     // return web page
            CURLOPT_HEADER         => false,    // don't return headers
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            CURLOPT_ENCODING       => "",       // handle all encodings
            CURLOPT_USERAGENT      => "spider", // who am i
            CURLOPT_AUTOREFERER    => true,     // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 240,      // timeout on connect
            CURLOPT_TIMEOUT        => 240,      // timeout on response
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
        );

        $ch = curl_init($url);
        if ($ch === false) {
            // No handle could be created; report it in the same shape as a
            // failed transfer so callers need only one error path.
            return array(
                'http_code' => 0,
                'errno'     => CURLE_FAILED_INIT,
                'errmsg'    => 'curl_init failed',
                'content'   => false,
            );
        }

        curl_setopt_array($ch, $options);
        $content = curl_exec($ch);
        $err     = curl_errno($ch);
        $errmsg  = curl_error($ch);
        $header  = curl_getinfo($ch);
        curl_close($ch);

        $header['errno']   = $err;
        $header['errmsg']  = $errmsg;
        $header['content'] = $content;
        return $header;
    }

    /**
     * Fetches a product page and extracts its images, name and description.
     *
     * For each item the last matching node on the page is used.
     *
     * @param string $url Address of the product page.
     * @param mysqli $con Database connection. Not used by the code in this
     *                    file; kept so existing callers keep working.
     * @return array|null Extracted values keyed by 'image_500', 'image_200',
     *                    'product_name', 'product_description' and
     *                    'specifications', or null when the page could not
     *                    be fetched or parsed.
     */
    function readDataFromUrl($url, $con)
    {
        $page_contents = $this->getWebPage($url);
        if (!$this->fetchSucceeded($page_contents)) {
            return null;
        }

        $xpath = $this->createXPath($page_contents['content']);
        if ($xpath === null) {
            return null;
        }

        // Enlarged image: href of the anchor with class "picture-link".
        $image_500 = $this->lastAttribute($xpath, "//a[@class='picture-link']", 'href');
        // Regular image: src of the img inside that anchor.
        $image_200 = $this->lastAttribute($xpath, "//a[@class='picture-link']/img", 'src');
        $product_name = $this->lastText(
            $xpath,
            '//table[@id="product-attribute-specs-table"]//h3[@itemprop="name"]'
        );
        $product_description = $this->lastText($xpath, '//h2[@itemprop="description"]');

        // NOTE(review): the code that fills the specification label/data
        // lists is not present in this file, so they start out empty here
        // instead of being read from undefined variables.
        $specification_label_array = array();
        $specification_data_array = array();
        $specification_array = array();
        $pairs = min(count($specification_label_array), count($specification_data_array));
        for ($i = 0; $i < $pairs; $i++) {
            $specification_array[$specification_label_array[$i]] = $specification_data_array[$i];
        }

        return array(
            'image_500'           => $image_500,
            'image_200'           => $image_200,
            'product_name'        => $product_name,
            'product_description' => $product_description,
            'specifications'      => $specification_array,
        );
    }

    /**
     * Prints the error messages for a failed fetch and tells the caller
     * whether the response is usable.
     *
     * @param array $page_contents Result of getWebPage().
     * @return bool True when there was no cURL error and the status is 200.
     */
    private function fetchSucceeded($page_contents)
    {
        $ok = true;
        if ($page_contents['errno'] != 0) {
            print "error: bad url, timeout, redirect loop ...";
            $ok = false;
        }
        if ($page_contents['http_code'] != 200) {
            print "error: no page, no permissions, no service ...";
            $ok = false;
        }
        return $ok;
    }

    /**
     * Parses an HTML string and returns an XPath helper for it.
     *
     * @param mixed $html Response body; may be false when the fetch failed.
     * @return DOMXPath|null Null when there is nothing to parse.
     */
    private function createXPath($html)
    {
        if (!is_string($html) || trim($html) === '') {
            return null;
        }

        // Real-world HTML is rarely valid; collect parser complaints
        // internally rather than emitting a warning for each of them.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument;
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $loaded ? new DOMXPath($dom) : null;
    }

    /**
     * Returns the last node matching an XPath expression.
     *
     * @param DOMXPath $xpath      XPath helper for the document.
     * @param string   $expression XPath expression to evaluate.
     * @return DOMNode|null Null when nothing matches or the expression is invalid.
     */
    private function lastNode($xpath, $expression)
    {
        $nodes = $xpath->query($expression);
        if ($nodes === false || $nodes->length === 0) {
            return null;
        }
        return $nodes->item($nodes->length - 1);
    }

    /**
     * Returns an attribute of the last element matching an XPath expression.
     *
     * @param DOMXPath $xpath      XPath helper for the document.
     * @param string   $expression XPath expression to evaluate.
     * @param string   $attribute  Attribute name to read.
     * @return string Attribute value, or "" when there is no matching element.
     */
    private function lastAttribute($xpath, $expression, $attribute)
    {
        $node = $this->lastNode($xpath, $expression);
        return ($node instanceof DOMElement) ? $node->getAttribute($attribute) : "";
    }

    /**
     * Returns the text of the last node matching an XPath expression.
     *
     * @param DOMXPath $xpath      XPath helper for the document.
     * @param string   $expression XPath expression to evaluate.
     * @return string Node text, or "" when nothing matches.
     */
    private function lastText($xpath, $expression)
    {
        $node = $this->lastNode($xpath, $expression);
        return ($node === null) ? "" : (string) $node->nodeValue;
    }
}
// Creating the object starts the scrape: the constructor connects to the
// database, fetches the index page and processes the links found on it.
$obj = new WebContentReader();
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment