Skip to content

Instantly share code, notes, and snippets.

@aaronbauman
Created November 15, 2021 14:43
Show Gist options
  • Save aaronbauman/863c781f48572e644ca6b26d451653a6 to your computer and use it in GitHub Desktop.
Save aaronbauman/863c781f48572e644ca6b26d451653a6 to your computer and use it in GitHub Desktop.
Spatie\Crawler\CrawlQueue\CrawlQueue implementation that limits how many URLs with similar paths are crawled, without limiting crawl depth
<?php
namespace MessageAgency\BackstopCrawler;
use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\CrawlQueue\ArrayCrawlQueue;
use Spatie\Crawler\CrawlUrl;
/**
 * Crawl queue that throttles URLs sharing the same parent directory.
 *
 * On top of the parent queue's own lookup, has() also reports a URL as
 * "already queued" when the same URL without its query string is queued, or
 * when enough URLs from the same base directory have already been let through.
 */
class BackstopCrawlQueue extends ArrayCrawlQueue {

  /**
   * How many URLs have been let through so far, keyed by base directory.
   *
   * @var int[]
   */
  protected $pathCount = [];

  /**
   * How many URLs sharing one base directory are let through.
   *
   * The first URL seen for a directory is always let through, regardless of
   * this value.
   *
   * @var int
   */
  protected $limitSimilar = 3;

  /**
   * Decides whether a URL should be treated as already present in the queue.
   *
   * Note that this method is not a pure lookup: every call that returns FALSE
   * for a nested path (one containing a "/" after trimming) bumps the counter
   * of that path's base directory.
   *
   * @param \Spatie\Crawler\CrawlUrl|\GuzzleHttp\Psr7\Uri $crawlUrl
   *   The URL to test. A CrawlUrl is unwrapped to its ->url property; anything
   *   else is used as-is and must be string-convertible and offer getPath().
   *
   * @return bool
   *   TRUE when the parent queue has the URL, when the parent queue has the
   *   URL minus its query string, or when the base directory has reached
   *   $limitSimilar. FALSE otherwise.
   */
  public function has($crawlUrl): bool {
    // An exact match in the underlying queue wins immediately.
    if (parent::has($crawlUrl)) {
      return TRUE;
    }

    /** @var \GuzzleHttp\Psr7\Uri $uri */
    $uri = ($crawlUrl instanceof CrawlUrl) ? $crawlUrl->url : $crawlUrl;

    // Query strings are ignored: a URL counts as queued if everything before
    // its first "?" is queued.
    if (strpos($uri, '?') !== FALSE && parent::has(new Uri(strtok($uri, '?')))) {
      return TRUE;
    }

    $trimmedPath = trim($uri->getPath(), '/');

    // Top-level URLs (no "/" left after trimming) are never throttled.
    if (strpos($trimmedPath, '/') === FALSE) {
      return FALSE;
    }

    // Everything before the final "/" is the base directory; URLs are deemed
    // similar when they share it.
    $lastSlash = strrpos($trimmedPath, '/', -1);
    $directory = substr($trimmedPath, 0, $lastSlash);

    $seenBefore = !empty($this->pathCount[$directory]);

    // Directory already used up its allowance: pretend the URL is queued.
    if ($seenBefore && $this->pathCount[$directory] >= $this->limitSimilar) {
      return TRUE;
    }

    // Otherwise record this URL against its directory and let it through.
    $this->pathCount[$directory] = $seenBefore ? $this->pathCount[$directory] + 1 : 1;
    return FALSE;
  }

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment