Using spatie/crawler to crawl a page and retrieve all links within that page.
<?php

namespace App\Http\Crawler;

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class CategoryHelper extends CrawlObserver
{
    /** @var string[] Links collected from the crawled pages. */
    public $pages = [];

    /** @var string CSS selector matching the anchors to collect. */
    protected $selector;

    /** @var string URL of the page currently being processed. */
    protected $url;

    public function crawled(UriInterface $url, ResponseInterface $response, UriInterface $foundOnUrl = null)
    {
        $this->url = $url->__toString();

        $crawler = new DomCrawler($response->getBody()->__toString());

        $crawler->filter($this->selector)->each(function (DomCrawler $node, $i) {
            // This is where the URLs are added to the array.
            $this->pages[] = $this->url . $node->attr('href');
        });
    }

    public function crawlFailed(UriInterface $url, RequestException $requestException, UriInterface $foundOnUrl = null)
    {
        echo 'failed';
    }

    public function finishedCrawling()
    {
        // Use this to do whatever needs to be done once the crawling has finished,
        // e.g. removing duplicate URLs from $this->pages.
    }

    public function setSelector($selector)
    {
        $this->selector = $selector;
    }
}
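The finishedCrawling() hook above is left empty on purpose; the author notes it is a good place to remove duplicate URLs. A minimal sketch of that step, assuming plain PHP array helpers are sufficient here:

public function finishedCrawling()
{
    // Drop duplicate URLs collected during the crawl and reindex the array.
    $this->pages = array_values(array_unique($this->pages));
}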
<?php

namespace App\Http\Controllers;

use App\Http\Crawler\CategoryHelper;
use Spatie\Crawler\Crawler;

class CrawlerController extends Controller
{
    public $root_url;

    public function crawl($url = null)
    {
        $this->root_url = 'https://www.amazon.com/';

        $cat_helper = new CategoryHelper();
        $cat_helper->setSelector('a.a-link-normal'); // Pass the CSS selector that matches the links

        Crawler::create()
            ->setCrawlObserver($cat_helper)
            ->setMaximumCrawlCount(1)
            ->ignoreRobots()
            ->startCrawling($this->root_url);

        $pages = $cat_helper->pages;
    }
}
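A rough usage sketch, assuming a standard Laravel install: the controller method can be exposed through a route so the collected links can be inspected. The /crawl path and the JSON response shown below are illustrative assumptions, not part of the gist.

<?php

// routes/web.php (illustrative wiring only).
// On Laravel 8+ the callable-array syntax would be:
// Route::get('/crawl', [\App\Http\Controllers\CrawlerController::class, 'crawl']);
use Illuminate\Support\Facades\Route;

Route::get('/crawl', 'CrawlerController@crawl');

// Inside crawl(), returning the collected links makes them visible in the response:
// return response()->json($cat_helper->pages);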