Using spatie/crawler to crawl a page and retrieve all links within that page.
<?php

namespace App\Http\Crawler;

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class CategoryHelper extends CrawlObserver
{
    /** @var string[] Links collected from the crawled pages. */
    public $pages = [];

    /** @var string CSS selector matching the anchors to collect. */
    protected $selector;

    /** @var string URL of the page currently being processed. */
    protected $url;

    public function crawled(UriInterface $url, ResponseInterface $response, UriInterface $foundOnUrl = null)
    {
        $this->url = $url->__toString();

        $crawler = new DomCrawler($response->getBody()->__toString());

        $crawler->filter($this->selector)->each(function (DomCrawler $node, $i) {
            // This is where the URLs are added to the array.
            $this->pages[] = $this->url . $node->attr('href');
        });
    }

    public function crawlFailed(UriInterface $url, RequestException $requestException, UriInterface $foundOnUrl = null)
    {
        echo 'failed';
    }

    public function finishedCrawling()
    {
        // Use this to do whatever needs to be done once the crawling has finished,
        // e.g. removing duplicate URLs from $this->pages.
    }

    public function setSelector($selector)
    {
        $this->selector = $selector;
    }
}
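The finishedCrawling() hook above is left empty on purpose; the author notes it is a good place to remove duplicate URLs. A minimal sketch of that step, assuming plain PHP array helpers are sufficient here:

public function finishedCrawling()
{
    // Drop duplicate URLs collected during the crawl and reindex the array.
    $this->pages = array_values(array_unique($this->pages));
}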
<?php

namespace App\Http\Controllers;

use App\Http\Crawler\CategoryHelper;
use Spatie\Crawler\Crawler;

class CrawlerController extends Controller
{
    public $root_url;

    public function crawl($url = null)
    {
        $this->root_url = 'https://www.amazon.com/';

        $cat_helper = new CategoryHelper();
        $cat_helper->setSelector('a.a-link-normal'); // Pass the CSS selector that matches the links

        Crawler::create()
            ->setCrawlObserver($cat_helper)
            ->setMaximumCrawlCount(1)
            ->ignoreRobots()
            ->startCrawling($this->root_url);

        $pages = $cat_helper->pages;
    }
}
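A rough usage sketch, assuming a standard Laravel install: the controller method can be exposed through a route so the collected links can be inspected. The /crawl path and the JSON response shown below are illustrative assumptions, not part of the gist.

<?php

// routes/web.php (illustrative wiring only).
// On Laravel 8+ the callable-array syntax would be:
// Route::get('/crawl', [\App\Http\Controllers\CrawlerController::class, 'crawl']);
use Illuminate\Support\Facades\Route;

Route::get('/crawl', 'CrawlerController@crawl');

// Inside crawl(), returning the collected links makes them visible in the response:
// return response()->json($cat_helper->pages);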