Skip to content

Instantly share code, notes, and snippets.

@iampersistent
Created June 6, 2012 08:39
Show Gist options
  • Save iampersistent/2880690 to your computer and use it in GitHub Desktop.
Save iampersistent/2880690 to your computer and use it in GitHub Desktop.
Filters for testing whether links are valid in the indexer
use CamelSpider\Entity\LinkFilter;
/**
 * Indexer that decides whether a discovered link is queued in the pool.
 *
 * A link is queued only when it is inside the subscription scope, passes
 * every registered LinkFilter, and is not already present in the cache.
 */
class Indexer extends AbstractSpider
{
    /**
     * Filters every candidate link must pass, in registration order.
     *
     * Initialised to an empty array so addLink() can iterate safely even
     * when no filter has been registered.
     *
     * @var LinkFilter[]
     */
    protected $linkFilters = array();

    /**
     * Registers a filter that addLink() runs against every candidate link.
     *
     * These could be added in a compiler pass in the DI.
     *
     * @param LinkFilter $filter Filter to append to the chain.
     */
    public function addLinkFilter(LinkFilter $filter)
    {
        $this->linkFilters[] = $filter;
    }

    /**
     * Validates a link and saves it to the pool when it is acceptable.
     *
     * @param Link $link Candidate link.
     *
     * @return int 1 when the link was saved to the pool, 0 when it was
     *             rejected (out of scope, refused by a filter, or cached).
     */
    protected function addLink(Link $link)
    {
        if (!$this->subscription->insideScope($link)) {
            $this->logger(
                'outside the scope'
                . "\n"
                . '['
                . $this->subscription->getDomainString()
                . "]\n["
                . $link->get('href')
                . ']'
                , 'info', 5
            );

            return 0;
        }

        // Avoid invalid links: the first filter that refuses the link wins.
        foreach ($this->linkFilters as $filter) {
            if (!$filter->run($link)) {
                $this->logFilterRejection($filter);

                return 0;
            }
        }

        $this->logger('Check Cache for id:' . $link->getId('string'), 'info', 5);

        // Avoid duplicates. The cache is only consulted once the request
        // counter is above zero.
        if ($this->requests > 0 && $this->cache->isObject($link->getId('string'))) {
            $this->logger('cached', 'info', 5);
            $this->cached++;

            return 0;
        }

        $this->pool->save($link);

        return 1;
    }

    /**
     * Forwards the log entry of a filter that refused a link to logger().
     *
     * Filters such as IsDocumentLink store their entry as
     * array($message, $level, $id); that array is unpacked into positional
     * arguments so the call matches every other logger() call in this class.
     * Any other non-null value is passed through as the message unchanged.
     *
     * @param LinkFilter $filter Filter whose run() just returned false.
     */
    protected function logFilterRejection(LinkFilter $filter)
    {
        $entry = $filter->getLog();

        if (is_array($entry)) {
            if (count($entry) > 0) {
                // array_values() guarantees purely positional arguments.
                call_user_func_array(array($this, 'logger'), array_values($entry));
            }

            return;
        }

        if (null !== $entry) {
            $this->logger($entry);
        }
    }
}
Example Filter:
/**
 * Link filter that accepts only links SpiderAsserts recognises as document
 * links.
 */
class IsDocumentLink implements LinkFilter
{
    /**
     * Log entry recorded by the most recent refusal, stored as
     * array($message, $level, $id); unset until a link has been refused.
     *
     * @var array|null
     */
    protected $log;

    /**
     * Returns the stored log entry.
     *
     * @return array|null
     */
    public function getLog()
    {
        return $this->log;
    }

    /**
     * Stores a log entry for later retrieval through getLog().
     *
     * @param mixed $message Log message.
     * @param mixed $level   Log level.
     * @param mixed $id      Third logger argument, stored as given.
     */
    public function setLog($message, $level, $id)
    {
        $entry = array($message, $level, $id);
        $this->log = $entry;
    }

    /**
     * Checks the link and records a log entry when it is refused.
     *
     * @param Link $link Link to check.
     *
     * @return bool true when the link is accepted, false otherwise.
     */
    public function run(Link $link)
    {
        $accepted = SpiderAsserts::isDocumentLink($link);

        if ($accepted) {
            return true;
        }

        $this->setLog('Href refused', 'info', 5);

        return false;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment