Last active
February 3, 2022 21:23
-
-
Save peterjaap/b9b56445a56e716bb53c978acecf7256 to your computer and use it in GitHub Desktop.
Sitemap Crawler coded during live session at Virtual Magento Meetup 30-04-2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php

declare(strict_types=1);

namespace VirtualMeetup\SitemapCrawler\Console\Command;

use Graze\ParallelProcess\Event\RunEvent;
use Graze\ParallelProcess\PriorityPool;
use Graze\ParallelProcess\RunInterface;
use Magento\Framework\Filesystem\Driver\File;
use Magento\Framework\HTTP\Client\Curl;
use Magento\Framework\Xml\Parser;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Helper\ProgressBar;
use Symfony\Component\Console\Helper\ProgressBarFactory;
use Symfony\Component\Console\Helper\Table;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\ConsoleOutputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Process\Process;

/**
 * Console command `sitemap:crawl`.
 *
 * The `target` argument is interpreted in one of two ways:
 *  - a URL: the URL is requested once and its HTTP status code is reported
 *    in the format selected with `--format` (text, json or table);
 *  - a path to a sitemap file: the `loc` entries of the sitemap are crawled
 *    in parallel by spawning `bin/magento sitemap:crawl <url> --format=json`
 *    sub-processes, and the results are collected in a live table.
 */
class Crawl extends Command
{
    /** Exit code returned when the command completed without problems. */
    private const EXIT_SUCCESS = 0;

    /** Exit code returned when the input was invalid or a crawl failed. */
    private const EXIT_FAILURE = 1;

    /** Default maximum number of sitemap URLs that are crawled. */
    private const DEFAULT_LIMIT = 16;

    /** Default number of crawl sub-processes running at the same time. */
    private const DEFAULT_CONCURRENCY = 4;

    /**
     * @var Curl
     */
    public $curl;

    /**
     * @var File
     */
    public $file;

    /**
     * @var Parser
     */
    public $xmlParser;

    /**
     * Crawl constructor.
     *
     * @param Curl        $curl      HTTP client used to request a single URL
     * @param File        $file      Filesystem driver used to check that the sitemap exists
     * @param Parser      $xmlParser XML parser used to read the sitemap
     * @param string|null $name      Optional command name, forwarded to the parent command
     */
    public function __construct(Curl $curl, File $file, Parser $xmlParser, ?string $name = null)
    {
        parent::__construct($name);

        $this->curl = $curl;
        $this->file = $file;
        $this->xmlParser = $xmlParser;
    }

    /**
     * Dispatches to the single-URL check or the sitemap crawl, depending on `target`.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     *
     * @return int 0 on success, 1 when the input is invalid or at least one URL could not be crawled
     */
    protected function execute(
        InputInterface $input,
        OutputInterface $output
    ): int {
        $target = $input->getArgument('target');

        // `target` is declared optional, so it may be absent; bail out instead of
        // handing null to the URL filter and the filesystem driver.
        if (!is_string($target) || $target === '') {
            $output->writeln('<error>No target given; pass a URL or the path to a sitemap file</>');

            return self::EXIT_FAILURE;
        }

        if (filter_var($target, FILTER_VALIDATE_URL)) {
            return $this->crawlUrl($target, $input->getOption('format'), $output);
        }

        if ($this->file->isFile($target)) {
            return $this->crawlSitemap($target, $input, $output);
        }

        $output->writeln('<error>Given file does not exist</>');

        return self::EXIT_FAILURE;
    }

    /**
     * Declares the command name, the `target` argument and the supported options.
     *
     * @return void
     */
    protected function configure()
    {
        $this->setName('sitemap:crawl');
        $this->setDescription('Sitemap crawler');
        $this->setDefinition([
            new InputArgument('target', InputArgument::OPTIONAL, 'Target'),
            new InputOption('format', '-f', InputOption::VALUE_OPTIONAL, 'Output format', 'text'),
            new InputOption(
                'limit',
                null,
                InputOption::VALUE_REQUIRED,
                'Maximum number of sitemap URLs to crawl (0 or less means no limit)',
                self::DEFAULT_LIMIT
            ),
            new InputOption(
                'concurrency',
                null,
                InputOption::VALUE_REQUIRED,
                'Number of URLs crawled simultaneously',
                self::DEFAULT_CONCURRENCY
            ),
        ]);

        parent::configure();
    }

    /**
     * Requests a single URL and prints its HTTP status code in the requested format.
     *
     * @param string          $url    URL to request
     * @param mixed           $format Value of the `format` option; expected to be text, json or table
     * @param OutputInterface $output
     *
     * @return int 0 when the result was printed, 1 when the format is not supported
     */
    private function crawlUrl(string $url, $format, OutputInterface $output): int
    {
        $this->curl->get($url);
        $statusCode = $this->curl->getStatus();
        $data = ['url' => $url, 'status_code' => $statusCode];

        switch ($format) {
            case 'text':
                $output->writeln('Status code of ' . $url . ' is ' . $statusCode);
                break;
            case 'json':
                // No trailing newline: the sitemap crawl decodes this output as-is.
                $output->write(json_encode($data));
                break;
            case 'table':
                $table = new Table($output);
                $table->setHeaders(['URL', 'Status code']);
                $table->setHeaderTitle('Sitemap crawl results');
                $table->addRow($data);
                $table->render();
                break;
            default:
                // Previously an unknown format silently produced no output at all.
                $output->writeln('<error>Unsupported format; use one of: text, json, table</>');

                return self::EXIT_FAILURE;
        }

        return self::EXIT_SUCCESS;
    }

    /**
     * Crawls the URLs listed in a sitemap file in parallel and renders the results.
     *
     * Every URL is checked by a separate `bin/magento sitemap:crawl` sub-process;
     * each successful run contributes one table row, and every finished run
     * (successful or not) advances the progress bar.
     *
     * @param string          $sitemapPath Path to the sitemap file
     * @param InputInterface  $input       Source of the `limit` and `concurrency` options
     * @param OutputInterface $output
     *
     * @return int 0 when every URL was crawled, 1 otherwise
     */
    private function crawlSitemap(string $sitemapPath, InputInterface $input, OutputInterface $output): int
    {
        // section() is only available on console outputs, not on every OutputInterface.
        if (!$output instanceof ConsoleOutputInterface) {
            $output->writeln('<error>Crawling a sitemap requires a console output that supports sections</>');

            return self::EXIT_FAILURE;
        }

        $limit = (int) $input->getOption('limit');
        $concurrency = max(1, (int) $input->getOption('concurrency'));

        $urls = $this->fetchUrlsFromSitemap($sitemapPath);
        if ($limit > 0) {
            $urls = array_slice($urls, 0, $limit);
        }

        if ($urls === []) {
            $output->writeln('<comment>No URLs found in the given sitemap</>');

            return self::EXIT_SUCCESS;
        }

        // The table and the progress bar live in separate sections so that rows
        // can be appended to the table while the bar below it keeps updating.
        $table = new Table($output->section());
        $table->setHeaders(['URL', 'Status code']);
        $table->setHeaderTitle('Sitemap crawl results');
        $table->render();

        $progressBar = new ProgressBar($output->section(), count($urls));
        $progressBar->start();

        $pool = new PriorityPool();
        $pool->setMaxSimultaneous($concurrency);

        foreach ($urls as $url) {
            $pool->add(new Process(['php', 'bin/magento', 'sitemap:crawl', $url, '--format=json']));
        }

        $failures = 0;

        /** @var RunInterface $run */
        foreach ($pool->getAll() as $run) {
            $run->addListener(
                RunEvent::SUCCESSFUL,
                static function (RunEvent $event) use ($table, $progressBar, &$failures) {
                    $data = json_decode((string) $event->getRun()->getLastMessage(), true);

                    // Only append well-formed results; anything else counts as a failed crawl.
                    if (is_array($data) && isset($data['url'], $data['status_code'])) {
                        $table->appendRow($data);
                    } else {
                        $failures++;
                    }

                    $progressBar->advance();
                }
            );

            // Without this listener a failed sub-process would go unnoticed and
            // the progress bar would never reach its end.
            $run->addListener(
                RunEvent::FAILED,
                static function () use ($progressBar, &$failures) {
                    $failures++;
                    $progressBar->advance();
                }
            );
        }

        $pool->run();
        $progressBar->finish();

        if ($failures > 0) {
            $output->writeln(sprintf('<error>%d URL(s) could not be crawled</>', $failures));

            return self::EXIT_FAILURE;
        }

        return self::EXIT_SUCCESS;
    }

    /**
     * Reads the `loc` value of every `url` entry in the given sitemap file.
     *
     * @param string|null $target Path to the sitemap file
     *
     * @return string[] URLs listed in the sitemap; empty when the expected structure is missing
     */
    private function fetchUrlsFromSitemap(?string $target): array
    {
        $content = $this->xmlParser->load($target)->xmlToArray();

        $urlset = is_array($content) ? ($content['urlset'] ?? []) : [];

        // NOTE(review): Magento's parser appears to wrap a node that carries attributes
        // as ['_value' => ..., '_attribute' => ...]; unwrap defensively - confirm.
        if (is_array($urlset) && isset($urlset['_value'])) {
            $urlset = $urlset['_value'];
        }

        $entries = is_array($urlset) ? ($urlset['url'] ?? []) : [];
        if (!is_array($entries)) {
            return [];
        }

        // A sitemap with a single <url> yields that entry directly rather than a list of entries.
        if (isset($entries['loc'])) {
            $entries = [$entries];
        }

        return array_values(array_filter(array_column($entries, 'loc'), 'is_string'));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment