Skip to content

Instantly share code, notes, and snippets.

@peterjaap
Last active February 3, 2022 21:23
Show Gist options
  • Save peterjaap/b9b56445a56e716bb53c978acecf7256 to your computer and use it in GitHub Desktop.
Save peterjaap/b9b56445a56e716bb53c978acecf7256 to your computer and use it in GitHub Desktop.
Sitemap Crawler coded during live session at Virtual Magento Meetup 30-04-2020
<?php declare(strict_types=1);
namespace VirtualMeetup\SitemapCrawler\Console\Command;
use Graze\ParallelProcess\Event\RunEvent;
use Graze\ParallelProcess\PriorityPool;
use Graze\ParallelProcess\RunInterface;
use Magento\Framework\Filesystem\Driver\File;
use Magento\Framework\HTTP\Client\Curl;
use Magento\Framework\Xml\Parser;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Helper\ProgressBar;
use Symfony\Component\Console\Helper\ProgressBarFactory;
use Symfony\Component\Console\Helper\Table;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Process\Process;
/**
 * Console command `sitemap:crawl`.
 *
 * The `target` argument selects one of two modes:
 *  - a URL: the URL is requested once and its HTTP status code is printed
 *    in the format chosen with `--format` (text, json or table);
 *  - a path to a sitemap file: the first MAX_URLS `<loc>` entries are crawled
 *    by child `sitemap:crawl` processes (at most MAX_SIMULTANEOUS at a time)
 *    and the results are appended to a table while a progress bar advances.
 */
class Crawl extends Command
{
    /**
     * Maximum number of sitemap URLs crawled per invocation.
     */
    private const MAX_URLS = 16;

    /**
     * Maximum number of child crawl processes running at the same time.
     */
    private const MAX_SIMULTANEOUS = 4;

    /**
     * Output formats understood by the single-URL mode.
     */
    private const FORMATS = ['text', 'json', 'table'];

    /**
     * @var Curl
     */
    public $curl;

    /**
     * @var File
     */
    public $file;

    /**
     * @var Parser
     */
    public $xmlParser;

    /**
     * Crawl constructor.
     *
     * @param Curl $curl HTTP client used to request a single URL
     * @param File $file filesystem driver used to check that the sitemap file exists
     * @param Parser $xmlParser XML parser used to read the sitemap file
     * @param string|null $name command name passed through to the Symfony base command
     */
    public function __construct(Curl $curl, File $file, Parser $xmlParser, ?string $name = null)
    {
        parent::__construct($name);
        $this->curl = $curl;
        $this->file = $file;
        $this->xmlParser = $xmlParser;
    }

    /**
     * {@inheritdoc}
     *
     * Dispatches to the single-URL or the sitemap mode depending on `target`.
     *
     * @return int 0 on success, 1 when the target or the format cannot be handled
     */
    protected function execute(
        InputInterface $input,
        OutputInterface $output
    ): int {
        $target = $input->getArgument('target');

        // The argument is declared optional, so it may be absent altogether.
        if (!is_string($target) || $target === '') {
            $output->writeln('<error>No target given: pass a URL or the path to a sitemap file</>');
            return 1;
        }

        if (filter_var($target, FILTER_VALIDATE_URL)) {
            return $this->crawlUrl($target, $input->getOption('format'), $output);
        }

        if ($this->file->isFile($target)) {
            return $this->crawlSitemap($target, $output);
        }

        $output->writeln('<error>Given file does not exist</>');
        return 1;
    }

    /**
     * {@inheritdoc}
     */
    protected function configure(): void
    {
        $this->setName('sitemap:crawl');
        $this->setDescription('Sitemap crawler');
        $this->setDefinition([
            new InputArgument('target', InputArgument::OPTIONAL, 'Target'),
            new InputOption('format', '-f', InputOption::VALUE_OPTIONAL, 'Output format', 'text'),
        ]);
        parent::configure();
    }

    /**
     * Requests a single URL and prints its HTTP status code.
     *
     * @param string $url URL to request
     * @param mixed $format value of the `--format` option; must be one of FORMATS
     * @param OutputInterface $output where the result is written
     * @return int 0 on success, 1 when the format is not supported
     */
    private function crawlUrl(string $url, $format, OutputInterface $output): int
    {
        // Reject an unknown format before doing any network work instead of
        // silently printing nothing afterwards.
        if (!in_array($format, self::FORMATS, true)) {
            $output->writeln(
                '<error>Unsupported format, use one of: ' . implode(', ', self::FORMATS) . '</>'
            );
            return 1;
        }

        $this->curl->get($url);
        $statusCode = $this->curl->getStatus();
        $data = ['url' => $url, 'status_code' => $statusCode];

        if ($format === 'json') {
            // No trailing newline: the sitemap mode decodes this output as-is.
            $output->write(json_encode($data));
        } elseif ($format === 'table') {
            $table = $this->createResultTable($output);
            $table->addRow($data);
            $table->render();
        } else {
            $output->writeln('Status code of ' . $url . ' is ' . $statusCode);
        }

        return 0;
    }

    /**
     * Crawls the URLs listed in a sitemap file using parallel child processes.
     *
     * @param string $sitemapFile path to an existing sitemap XML file
     * @param OutputInterface $output console output; must support sections
     * @return int 0 once the pool has finished, 1 when the output has no section support
     */
    private function crawlSitemap(string $sitemapFile, OutputInterface $output): int
    {
        // The live table and the progress bar are each drawn in their own output
        // section, which only console outputs provide.
        if (!$output instanceof \Symfony\Component\Console\Output\ConsoleOutputInterface) {
            $output->writeln('<error>Crawling a sitemap requires a console output that supports sections</>');
            return 1;
        }

        $urls = array_slice($this->fetchUrlsFromSitemap($sitemapFile), 0, self::MAX_URLS);

        $table = $this->createResultTable($output->section());
        $table->render();

        $progressBar = new ProgressBar($output->section(), count($urls));
        $progressBar->start();

        $pool = new PriorityPool();
        $pool->setMaxSimultaneous(self::MAX_SIMULTANEOUS);
        foreach ($urls as $url) {
            // Each URL is crawled by re-invoking this command in single-URL mode.
            // NOTE(review): 'bin/magento' is resolved relative to the current
            // working directory - confirm the command is always run from the
            // Magento root.
            $pool->add(new Process(['php', 'bin/magento', 'sitemap:crawl', $url, '--format=json']));
        }

        /** @var RunInterface $run */
        foreach ($pool->getAll() as $run) {
            $run->addListener(RunEvent::SUCCESSFUL, function (RunEvent $event) use ($table, $progressBar) {
                $data = json_decode((string) $event->getRun()->getLastMessage(), true);
                // Skip output that is not the expected JSON object rather than
                // handing a non-array to the table.
                if (is_array($data)) {
                    $table->appendRow($data);
                }
                $progressBar->advance();
            });
        }

        $pool->run();

        return 0;
    }

    /**
     * Builds the result table with its headers and title, without rendering it.
     *
     * @param OutputInterface $output output (or output section) the table renders to
     * @return Table
     */
    private function createResultTable(OutputInterface $output): Table
    {
        $table = new Table($output);
        $table->setHeaders(['URL', 'Status code']);
        $table->setHeaderTitle('Sitemap crawl results');

        return $table;
    }

    /**
     * Reads the `<loc>` values of all `<url>` entries from a sitemap file.
     *
     * @param string $target path to the sitemap XML file
     * @return string[] list of URLs; empty when the file has no `urlset`/`url` entries
     */
    private function fetchUrlsFromSitemap(string $target): array
    {
        $content = $this->xmlParser->load($target)->xmlToArray();
        $entries = $content['urlset']['url'] ?? [];
        if (!is_array($entries)) {
            return [];
        }

        // NOTE(review): a sitemap with exactly one <url> element presumably comes
        // back as a single associative array instead of a list of them - wrap it
        // so array_column() still finds its 'loc'. Confirm against the parser.
        if (isset($entries['loc'])) {
            $entries = [$entries];
        }

        // Only plain string locations can be passed on as process arguments.
        return array_values(array_filter(array_column($entries, 'loc'), 'is_string'));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment