Skip to content

Instantly share code, notes, and snippets.

@halfer
Last active April 5, 2017 19:39
Show Gist options
  • Save halfer/28b1875c7f1e9fc884ba9f2102ba91db to your computer and use it in GitHub Desktop.
Script to examine bugs and behaviours in spatie/crawler (for 1.3, shows that I should not be doing my own de-duping!)
<?php
namespace Proximate;
use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\Url;
use Spatie\Crawler\CrawlObserver;
#use Spatie\Crawler\CrawlInternalUrls;
require 'vendor/autoload.php';
// The site to crawl; the same address doubles as the base URL used by the
// internal-URL crawl profile below.
$url = 'http://ilovephp.jondh.me.uk/';
$baseUrl = $url;
// @todo We need to add a Guzzle plugin into the client, to make curl/header changes
$client = new Client([
    RequestOptions::COOKIES => true,
    RequestOptions::CONNECT_TIMEOUT => 10,
    RequestOptions::TIMEOUT => 10,
    RequestOptions::ALLOW_REDIRECTS => true,
]);
/**
 * Minimal crawl observer: reports each crawled URL's path on stdout.
 */
class MyCrawlObserver implements CrawlObserver
{
    /**
     * Called by the crawler just before a URL is fetched; nothing to do here.
     */
    public function willCrawl(Url $url)
    {
    }

    /**
     * Called by the crawler after a URL has been fetched; logs the path.
     */
    public function hasBeenCrawled(Url $url, $response, Url $foundOnUrl = null)
    {
        printf("Crawled URL: %s\n", $url->path());
    }

    /**
     * Called once when the whole crawl is finished; nothing to do here.
     */
    public function finishedCrawling()
    {
    }
}
/**
 * Crawl profile restricting the crawl to URLs on the same host as the
 * supplied base URL.
 *
 * Local stand-in for the Spatie\Crawler\CrawlInternalUrls import that is
 * commented out at the top of the file.
 */
class CrawlInternalUrls implements \Spatie\Crawler\CrawlProfile
{
    /** @var string Host component of the base URL, e.g. "example.com" */
    protected $host = '';

    public function __construct(string $baseUrl)
    {
        // parse_url() returns null when the URL has no host component and
        // false when the URL is seriously malformed; cast so $host is always
        // a string and the strict comparison in shouldCrawl() stays sane.
        $this->host = (string) parse_url($baseUrl, PHP_URL_HOST);
    }

    /**
     * A URL should be crawled only when its host matches the base host.
     */
    public function shouldCrawl(Url $url): bool
    {
        return $this->host === $url->host;
    }
}
// @todo Add regex crawl logic here (in shouldCrawl())
/**
 * Crawl profile limiting the crawl to internal URLs whose path starts with
 * /en/tutorial (plus the site root), each path offered at most once.
 *
 * NOTE(review): per the gist description, this hand-rolled de-duping should
 * not be necessary with the library; it is kept to demonstrate behaviour.
 */
class MyCrawlProfile extends CrawlInternalUrls
{
    /** @var string[] Paths already seen by shouldCrawl() */
    protected $visited = [];

    /**
     * Decide whether the crawler should fetch this URL.
     */
    public function shouldCrawl(Url $url) : bool
    {
        $isInternal = parent::shouldCrawl($url);
        // @todo This needs to be generalised
        $matchesRegex = strpos($url->path(), '/en/tutorial') === 0;
        $matchesRoot = $url->path() === '/';
        // The crawler gets stuck if it can only visit a URL once
        $hasVisited = $this->hasVisited($url->path());
        #$hasVisited = false; // FIXME hack
        // Mark as visited. NOTE(review): this records the path even when the
        // URL is rejected below; harmless here, since a rejected path would
        // be rejected again on any later check anyway.
        $this->visited($url->path());
        $shouldCrawl =
            $isInternal &&
            ($matchesRegex || $matchesRoot) &&
            !$hasVisited;
        if ($shouldCrawl)
        {
            echo sprintf("Should crawl %s\n", $url->path());
        }
        return $shouldCrawl;
    }

    /**
     * Record a path as visited (idempotent).
     */
    protected function visited($path)
    {
        if (!$this->hasVisited($path))
        {
            $this->visited[] = $path;
        }
    }

    /**
     * Whether a path has already been seen by shouldCrawl().
     */
    protected function hasVisited($path)
    {
        // Strict comparison avoids PHP's loose string/number coercion
        // (e.g. '0' == 0) when matching paths.
        return in_array($path, $this->visited, true);
    }
}
// Wire up the crawler with our profile and observer, then start the crawl.
$crawler = new Crawler($client);
$crawler
    ->setCrawlProfile(new MyCrawlProfile($baseUrl))
    ->setCrawlObserver(new MyCrawlObserver())
    #->setConcurrency(1)
    ->startCrawling($url);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment