scrapingace · September 1, 2024 22:48
diff --git a/books_scraper.php b/books_scraper.php
 <?php
  require 'vendor/autoload.php';
  use GuzzleHttp\Client;

  /**
   * Scrapes the book listing served at the root of $base_uri and appends
   * one CSV row per book to ./result.csv.
   *
   * Pipeline (see run()): fetch HTML -> build DOM/XPath -> extract fields -> write CSV.
   */
  class BooksScraper {
    // Properties are declared explicitly (they used to be created on the fly,
    // which PHP 8.2 deprecates). They stay public because dynamically created
    // properties were publicly accessible.

    /** @var string Base URI for the HTTP client; also prefixed to scraped relative paths. */
    public $base_uri;

    /** @var Client HTTP client configured with $base_uri. */
    public $client;

    /** @var string Raw HTML of the fetched page; empty until load_html() has run. */
    public $html = '';

    /** @var DOMDocument|null Parsed document; set by load_dom(). */
    public $doc;

    /** @var DOMXPath|null XPath evaluator over $doc; set by load_dom(). */
    public $xpath;

    /**
     * Sets the target site and creates the HTTP client used to fetch it.
     */
    public function __construct() {
      //setup base_uri
      $this->base_uri = 'https://books.toscrape.com/';

      // create Guzzle HTTP client
      $this->client = new Client([
        'base_uri' => $this->base_uri,
        'timeout'  => 300.0,
      ]);
    }

    /**
     * Runs the whole scrape: download, parse, extract and write to CSV.
     *
     * @throws Exception When no HTML was fetched, no book nodes are found,
     *                   or the CSV file cannot be opened.
     */
    public function run() {
      $this->load_html(); // Load HTML from URL
      $this->load_dom(); // Load HTML to DOMDocument & DOMXPath to start reading nodes
      $this->scrape(); // Scrape data from nodes as required
    }

    /**
     * Fetches the root page and stores its body in $html.
     */
    private function load_html() {
      $response = $this->client->get('/');

      // Casting the body stream to string reads it from the start,
      // independent of the current stream position.
      $this->html = (string) $response->getBody();
    }

    /**
     * Parses $html into $doc and prepares $xpath for querying.
     *
     * @throws Exception When $html is empty.
     */
    private function load_dom() {
      // throw Exception if no HTML content.
      if ( $this->html === null || $this->html === '' ) { throw new Exception('No HTML content.'); }

      $this->doc = new DOMDocument;

      // Real-world markup makes libxml report many recoverable errors. Buffer them
      // instead of silencing everything with "@", then restore the previous
      // setting so other libxml users are unaffected.
      $previous = libxml_use_internal_errors(true);
      $this->doc->loadHTML($this->html);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);

      $this->xpath = new DOMXPath($this->doc);
    }

    /**
     * Finds every book node, extracts its fields and writes them to CSV.
     *
     * @throws Exception When the page contains no book nodes.
     */
    private function scrape() {
      // Identify all book nodes
      $elements = $this->xpath->query("//ol[@class='row']//li//article");
      if ($elements === false || $elements->length === 0) {
        throw new Exception('Elements not present for scraping.');
      }

      // Collect one associative row per book node
      $data = [];
      foreach ($elements as $element) {
        $data[] = $this->parse_node( $element );
      }

      // Write $data to csv
      $this->to_csv($data);
    }

    /**
     * Extracts the fields of a single book node.
     *
     * @param DOMNode $element Book node (an <article> matched by scrape()).
     * @return array Row keyed by image_path, title, price, availability, details_link
     *               (same order as the CSV header). Missing values come back as ''
     *               (or just $base_uri for the two URL fields).
     */
    private function parse_node($element) {
      $item                 = [];
      $item['image_path']   = $this->base_uri . $this->extract(".//div[@class='image_container']//a//img/@src", $element);
      $item['title']        = $this->extract(".//h3//a", $element);
      $item['price']        = $this->extract(".//div[@class='product_price']//p[@class='price_color']", $element);
      $item['availability'] = $this->extract(".//div[@class='product_price']//p[@class='instock availability']", $element);
      $item['details_link'] = $this->base_uri . $this->extract(".//h3//a/@href", $element);

      return $item;
    }

    /**
     * Returns the trimmed text of the first node matching an XPath expression.
     *
     * @param string  $node    XPath expression, evaluated relative to $element.
     * @param DOMNode $element Context node for the query.
     * @return string Trimmed node value, or '' when nothing matches.
     */
    private function extract($node, $element) {
      $matches = $this->xpath->query($node, $element);

      // No match (or an invalid expression): return an empty value instead of
      // reading a property on null.
      if ($matches === false || $matches->length === 0) {
        return '';
      }

      return trim((string) $matches->item(0)->nodeValue);
    }

    /**
     * Appends rows to ./result.csv, writing the header only when the file is empty.
     *
     * @param array $data List of rows as produced by parse_node().
     * @throws Exception When the file cannot be opened for writing.
     */
    private function to_csv($data) {
      $file = fopen ( './result.csv', 'a' );
      if ($file === false) {
        throw new Exception('Unable to open ./result.csv for writing.');
      }

      // The file is opened in append mode, so only emit the header when nothing
      // has been written yet; otherwise every run would add another header row.
      $stat = fstat($file);
      $is_empty = ($stat === false) || $stat['size'] === 0;

      // Delimiter, enclosure and escape are the historical defaults, passed
      // explicitly so the output does not depend on implicit default values.
      if ($is_empty) {
        fputcsv ( $file, ['image_path', 'title', 'price', 'availability', 'details_link'], ',', '"', '\\' );
      }

      // write books data
      foreach ($data as $item) { fputcsv ( $file, $item, ',', '"', '\\' ); }
      fclose ( $file );
    }
  }
  
  // Entry point: build the scraper and run the full fetch/parse/export
  // pipeline. The confirmation is only printed when run() returns normally.
  $scraper = new BooksScraper;
  $scraper->run();
  print 'Success!';
 ?>
	<?php
	require 'vendor/autoload.php';
	use GuzzleHttp\Client;

	/**
	 * Scrapes the book listing served at the root of $base_uri and appends
	 * one CSV row per book to ./result.csv.
	 *
	 * Pipeline (see run()): fetch HTML -> build DOM/XPath -> extract fields -> write CSV.
	 */
	class BooksScraper {
		// Properties are declared explicitly (they used to be created on the fly,
		// which PHP 8.2 deprecates). They stay public because dynamically created
		// properties were publicly accessible.

		/** @var string Base URI for the HTTP client; also prefixed to scraped relative paths. */
		public $base_uri;

		/** @var Client HTTP client configured with $base_uri. */
		public $client;

		/** @var string Raw HTML of the fetched page; empty until load_html() has run. */
		public $html = '';

		/** @var DOMDocument|null Parsed document; set by load_dom(). */
		public $doc;

		/** @var DOMXPath|null XPath evaluator over $doc; set by load_dom(). */
		public $xpath;

		/**
		 * Sets the target site and creates the HTTP client used to fetch it.
		 */
		public function __construct() {
			//setup base_uri
			$this->base_uri = 'https://books.toscrape.com/';

			// create Guzzle HTTP client
			$this->client = new Client([
				'base_uri' => $this->base_uri,
				'timeout' => 300.0,
			]);
		}

		/**
		 * Runs the whole scrape: download, parse, extract and write to CSV.
		 *
		 * @throws Exception When no HTML was fetched, no book nodes are found,
		 *                   or the CSV file cannot be opened.
		 */
		public function run() {
			$this->load_html(); // Load HTML from URL
			$this->load_dom(); // Load HTML to DOMDocument & DOMXPath to start reading nodes
			$this->scrape(); // Scrape data from nodes as required
		}

		/**
		 * Fetches the root page and stores its body in $html.
		 */
		private function load_html() {
			$response = $this->client->get('/');

			// Casting the body stream to string reads it from the start,
			// independent of the current stream position.
			$this->html = (string) $response->getBody();
		}

		/**
		 * Parses $html into $doc and prepares $xpath for querying.
		 *
		 * @throws Exception When $html is empty.
		 */
		private function load_dom() {
			// throw Exception if no HTML content.
			if ( $this->html === null || $this->html === '' ) { throw new Exception('No HTML content.'); }

			$this->doc = new DOMDocument;

			// Real-world markup makes libxml report many recoverable errors. Buffer them
			// instead of silencing everything with "@", then restore the previous
			// setting so other libxml users are unaffected.
			$previous = libxml_use_internal_errors(true);
			$this->doc->loadHTML($this->html);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);

			$this->xpath = new DOMXPath($this->doc);
		}

		/**
		 * Finds every book node, extracts its fields and writes them to CSV.
		 *
		 * @throws Exception When the page contains no book nodes.
		 */
		private function scrape() {
			// Identify all book nodes
			$elements = $this->xpath->query("//ol[@class='row']//li//article");
			if ($elements === false || $elements->length === 0) {
				throw new Exception('Elements not present for scraping.');
			}

			// Collect one associative row per book node
			$data = [];
			foreach ($elements as $element) {
				$data[] = $this->parse_node( $element );
			}

			// Write $data to csv
			$this->to_csv($data);
		}

		/**
		 * Extracts the fields of a single book node.
		 *
		 * @param DOMNode $element Book node (an <article> matched by scrape()).
		 * @return array Row keyed by image_path, title, price, availability, details_link
		 *               (same order as the CSV header). Missing values come back as ''
		 *               (or just $base_uri for the two URL fields).
		 */
		private function parse_node($element) {
			$item = [];
			$item['image_path'] = $this->base_uri . $this->extract(".//div[@class='image_container']//a//img/@src", $element);
			$item['title'] = $this->extract(".//h3//a", $element);
			$item['price'] = $this->extract(".//div[@class='product_price']//p[@class='price_color']", $element);
			$item['availability'] = $this->extract(".//div[@class='product_price']//p[@class='instock availability']", $element);
			$item['details_link'] = $this->base_uri . $this->extract(".//h3//a/@href", $element);

			return $item;
		}

		/**
		 * Returns the trimmed text of the first node matching an XPath expression.
		 *
		 * @param string  $node    XPath expression, evaluated relative to $element.
		 * @param DOMNode $element Context node for the query.
		 * @return string Trimmed node value, or '' when nothing matches.
		 */
		private function extract($node, $element) {
			$matches = $this->xpath->query($node, $element);

			// No match (or an invalid expression): return an empty value instead of
			// reading a property on null.
			if ($matches === false || $matches->length === 0) {
				return '';
			}

			return trim((string) $matches->item(0)->nodeValue);
		}

		/**
		 * Appends rows to ./result.csv, writing the header only when the file is empty.
		 *
		 * @param array $data List of rows as produced by parse_node().
		 * @throws Exception When the file cannot be opened for writing.
		 */
		private function to_csv($data) {
			$file = fopen ( './result.csv', 'a' );
			if ($file === false) {
				throw new Exception('Unable to open ./result.csv for writing.');
			}

			// The file is opened in append mode, so only emit the header when nothing
			// has been written yet; otherwise every run would add another header row.
			$stat = fstat($file);
			$is_empty = ($stat === false) || $stat['size'] === 0;

			// Delimiter, enclosure and escape are the historical defaults, passed
			// explicitly so the output does not depend on implicit default values.
			if ($is_empty) {
				fputcsv ( $file, ['image_path', 'title', 'price', 'availability', 'details_link'], ',', '"', '\\' );
			}

			// write books data
			foreach ($data as $item) { fputcsv ( $file, $item, ',', '"', '\\' ); }
			fclose ( $file );
		}
	}

	// Entry point: build the scraper and run the full fetch/parse/export
	// pipeline. The confirmation is only printed when run() returns normally.
	$scraper = new BooksScraper;
	$scraper->run();
	print 'Success!';
	?>