Last active
September 1, 2024 22:48
-
-
Save scrapingace/72d35d3f813c23482bd361cacd61be9c to your computer and use it in GitHub Desktop.
Simple PHP Scraper Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require 'vendor/autoload.php'; | |
use GuzzleHttp\Client; | |
/**
 * Scrapes the book listing served at the root of https://books.toscrape.com/
 * and appends one CSV row per book to an output file.
 *
 * Pipeline (see run()): fetch HTML -> build DOM/XPath -> extract rows -> write CSV.
 *
 * Relies on the file-level `use GuzzleHttp\Client;` import for the HTTP client.
 */
class BooksScraper
{
    /** @var string[] CSV header; same keys, in the same order, as the rows built by parse_node(). */
    private static $columns = ['image_path', 'title', 'price', 'availability', 'details_link'];

    /** @var string Base URI for the HTTP client; also prefixed to extracted image/link paths. */
    private $base_uri;

    /** @var string Path of the CSV file that rows are appended to. */
    private $output_path;

    /** @var Client Guzzle HTTP client configured with $base_uri. */
    private $client;

    /** @var string|null Raw HTML of the fetched page; null until load_html() has run. */
    private $html;

    /** @var DOMDocument|null Parsed document; null until load_dom() has run. */
    private $doc;

    /** @var DOMXPath|null XPath evaluator bound to $doc; null until load_dom() has run. */
    private $xpath;

    /**
     * @param string $output_path CSV file to append to. Defaults to the path the
     *                            scraper has always used, so existing callers are unaffected.
     */
    public function __construct($output_path = './result.csv')
    {
        $this->base_uri = 'https://books.toscrape.com/';
        $this->output_path = $output_path;

        $this->client = new Client([
            'base_uri' => $this->base_uri,
            'timeout'  => 300.0,
        ]);
    }

    /**
     * Runs the whole scrape: download, parse, extract and write the CSV.
     *
     * @throws Exception When the page is empty or unparsable, no book nodes are
     *                   found, or the output file cannot be opened.
     */
    public function run()
    {
        $this->load_html();
        $this->load_dom();
        $this->scrape();
    }

    /**
     * Downloads the page at the root of the base URI and keeps its body in $html.
     */
    private function load_html()
    {
        $response = $this->client->get('/');
        // Casting the body stream to string yields the complete response body.
        $this->html = (string) $response->getBody();
    }

    /**
     * Parses $html into a DOMDocument and prepares the XPath evaluator.
     *
     * @throws Exception When there is no HTML to parse or the parser rejects it.
     */
    private function load_dom()
    {
        if ($this->html === null || $this->html === '') {
            throw new Exception('No HTML content.');
        }

        $this->doc = new DOMDocument();

        // Collect libxml parse warnings internally instead of silencing every
        // error with "@", then restore whatever mode the caller had set.
        $previous = libxml_use_internal_errors(true);
        $loaded = $this->doc->loadHTML($this->html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            throw new Exception('Unable to parse HTML content.');
        }

        $this->xpath = new DOMXPath($this->doc);
    }

    /**
     * Finds every book node, converts each one to a row and writes the rows to CSV.
     *
     * @throws Exception When no book nodes are present in the document.
     */
    private function scrape()
    {
        $elements = $this->xpath->query("//ol[@class='row']//li//article");

        // query() yields false on an invalid expression/context; treat that like "nothing found".
        if ($elements === false || $elements->length === 0) {
            throw new Exception('Elements not present for scraping.');
        }

        $data = [];
        foreach ($elements as $element) {
            $data[] = $this->parse_node($element);
        }

        $this->to_csv($data);
    }

    /**
     * Builds one CSV row from a single book node.
     *
     * @param DOMNode $element The <article> node of one book.
     *
     * @return array{image_path:string,title:string,price:string,availability:string,details_link:string}
     */
    private function parse_node($element)
    {
        $price_box = ".//div[@class='product_price']";

        return [
            'image_path'   => $this->base_uri . $this->extract(".//div[@class='image_container']//a//img/@src", $element),
            'title'        => $this->extract('.//h3//a', $element),
            'price'        => $this->extract($price_box . "//p[@class='price_color']", $element),
            'availability' => $this->extract($price_box . "//p[@class='instock availability']", $element),
            'details_link' => $this->base_uri . $this->extract('.//h3//a/@href', $element),
        ];
    }

    /**
     * Returns the trimmed text of the first node matching an XPath expression.
     *
     * @param string  $node    XPath expression, evaluated relative to $element.
     * @param DOMNode $element Context node for the expression.
     *
     * @return string The trimmed node value, or '' when nothing matches, so a
     *                missing field no longer triggers a property read on null.
     */
    private function extract($node, $element)
    {
        $matches = $this->xpath->query($node, $element);

        if ($matches === false || $matches->length === 0) {
            return '';
        }

        return trim((string) $matches->item(0)->nodeValue);
    }

    /**
     * Appends the rows to the CSV file, writing the header only when the file is empty.
     *
     * The file is opened in append mode, so earlier results are kept; writing the
     * header unconditionally would insert a duplicate header row on every run.
     *
     * @param array[] $data Rows as produced by parse_node().
     *
     * @throws Exception When the output file cannot be opened.
     */
    private function to_csv($data)
    {
        $file = fopen($this->output_path, 'a');
        if ($file === false) {
            throw new Exception('Unable to open ' . $this->output_path . ' for writing.');
        }

        try {
            $stat = fstat($file);
            if ($stat === false || $stat['size'] === 0) {
                $this->write_row($file, self::$columns);
            }

            foreach ($data as $item) {
                $this->write_row($file, $item);
            }
        } finally {
            // Release the handle even if a write throws.
            fclose($file);
        }
    }

    /**
     * Writes a single CSV line.
     *
     * Separator, enclosure and escape are passed explicitly with the values
     * fputcsv() has historically defaulted to: the output is unchanged, and the
     * call does not rely on the default $escape (deprecated as of PHP 8.4).
     *
     * @param resource $file Open, writable file handle.
     * @param array    $row  Field values, written in array order.
     */
    private function write_row($file, array $row)
    {
        fputcsv($file, $row, ',', '"', '\\');
    }
}
// Script entry point: build the scraper, run the full pipeline, then report
// completion. An exception thrown by run() is not caught here, so the
// success message is printed only when run() returns normally.
$scraper = new BooksScraper;
$scraper->run();
print 'Success!';
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment