Skip to content

Instantly share code, notes, and snippets.

@vertexvaar
Created November 5, 2020 18:10
Show Gist options
  • Save vertexvaar/43e513b9add3be4488a0a93f3fb5b06a to your computer and use it in GitHub Desktop.
Save vertexvaar/43e513b9add3be4488a0a93f3fb5b06a to your computer and use it in GitHub Desktop.
Parallel Browser Crawler concept
<?php
// Library Code
/**
 * Stand-in for the browser object of the underlying library.
 *
 * Its only job here is to hand out new Page instances.
 */
class Browser
{
    /**
     * Creates a new page.
     *
     * @return Page A freshly constructed page; each call yields a new instance.
     */
    public function createPage(): Page
    {
        $page = new Page();

        return $page;
    }
}
<?php
/**
 * Wraps a single Page and tracks whether a visit is currently in progress on it.
 */
class BrowserTab
{
    /** @var Page The page this tab drives. */
    private Page $page;

    /** @var bool True only while visit() is executing. */
    private bool $isBusy = false;

    /**
     * @param Page $page The page that visit() will navigate.
     */
    public function __construct(Page $page)
    {
        $this->page = $page;
    }

    /**
     * Visits the URL with the wrapped page and passes the page's response to the callback.
     *
     * The tab is marked busy for the duration of the call. The busy flag is
     * reset in a finally block so that an exception thrown by the page or by
     * the callback cannot leave the tab permanently marked as busy.
     *
     * @param string  $url     The URL to visit.
     * @param Closure $closure Invoked with the value of $page->getResponse() after the visit.
     *
     * @throws Throwable Anything thrown by the page or the callback propagates unchanged.
     */
    public function visit(string $url, Closure $closure)
    {
        $this->isBusy = true;

        try {
            /* await */ $this->page->visit($url);
            $closure($this->page->getResponse());
        } finally {
            $this->isBusy = false;
        }
    }

    /**
     * @return bool True while a visit() call is in progress on this tab.
     */
    public function isBusy(): bool
    {
        return $this->isBusy;
    }
}
<?php
/**
 * A fixed-size pool of BrowserTab instances created from one Browser.
 */
class BrowserTabCollection
{
    /**
     * Initialised to an empty array so that iterating a collection created
     * with zero tabs does not iterate over null.
     *
     * @var BrowserTab[]
     */
    private array $browserTabs = [];

    /**
     * Creates $tabsCount tabs, each wrapping a new page from the browser.
     *
     * @param Browser $browser   Source of the pages.
     * @param int     $tabsCount Number of tabs to create; values below 1 create none.
     */
    public function __construct(Browser $browser, int $tabsCount)
    {
        for ($i = 0; $i < $tabsCount; $i++) {
            $this->browserTabs[$i] = new BrowserTab($browser->createPage());
        }
    }

    /**
     * Returns the first tab that is not busy, polling every 100 ms until one is.
     *
     * @return BrowserTab A tab whose isBusy() was false when checked.
     *
     * @throws LogicException When the collection holds no tabs, because the
     *                        polling loop could otherwise never terminate.
     */
    public function getIdleBrowserTab(): BrowserTab
    {
        if ($this->browserTabs === []) {
            throw new LogicException('Cannot get an idle browser tab from an empty collection.');
        }

        while (true) {
            foreach ($this->browserTabs as $browserTab) {
                if (!$browserTab->isBusy()) {
                    return $browserTab;
                }
            }

            // No tab is free right now; wait 100 ms before scanning again.
            usleep(100000);
        }
    }

    /**
     * @return bool True if at least one tab reports itself as busy.
     */
    public function isBusy(): bool
    {
        foreach ($this->browserTabs as $browserTab) {
            if ($browserTab->isBusy()) {
                return true;
            }
        }

        return false;
    }
}
<?php
/**
 * Crawls pages starting from a URL, distributing visits over a pool of browser tabs
 * and recording each response's cookies keyed by the response URL.
 */
class Crawler
{
    /** Microseconds to wait before re-checking for work while tabs are still busy. */
    private const IDLE_POLL_INTERVAL_US = 100000;

    /** @var SplQueue URLs waiting to be visited. */
    private SplQueue $queue;

    /** @var BrowserTabCollection Pool of tabs used to visit URLs. */
    private BrowserTabCollection $browserTabCollection;

    /** @var array Cookies of each visited response, keyed by the response URL. */
    private array $resultStorage = [];

    /** @var array<string, true> URLs that have already been queued, used as a set. */
    private array $seenUrls = [];

    /**
     * Sets up an empty queue and a pool of five browser tabs.
     */
    public function __construct()
    {
        $this->queue = new SplQueue();
        $this->browserTabCollection = new BrowserTabCollection(new Browser(), 5);
    }

    /**
     * Queues the URL and processes the queue until it is drained and no tab is busy.
     *
     * A URL passed here is always queued, even if it was seen before.
     *
     * @param string $url The URL to start crawling from.
     */
    public function crawl(string $url): void
    {
        $this->seenUrls[$url] = true;
        $this->queue->enqueue($url);
        $this->work();
    }

    /**
     * Dispatches queued URLs to idle tabs until there is nothing left to do.
     *
     * The loop keeps running while tabs are busy because their callbacks may
     * still add URLs to the queue. In that state the queue can be empty, and
     * SplQueue::dequeue() throws on an empty queue, so the loop waits instead
     * of dequeuing.
     */
    private function work(): void
    {
        $closure = $this->getPageIndexerClosure();

        while (true) {
            if ($this->queue->isEmpty()) {
                if (!$this->browserTabCollection->isBusy()) {
                    // Nothing queued and nothing in flight: the crawl is finished.
                    break;
                }

                usleep(self::IDLE_POLL_INTERVAL_US);
                continue;
            }

            $url = $this->queue->dequeue();
            $tab = $this->browserTabCollection->getIdleBrowserTab();
            $tab->visit($url, $closure);
        }
    }

    /**
     * Builds the callback that handles the response of a visited page.
     *
     * The callback queues every link found in the response HTML that has not
     * been queued before, then stores the response cookies under the response
     * URL. Skipping already-seen links keeps pages that link to each other
     * from being re-queued without end.
     *
     * @return Closure Callback accepting a single Response argument.
     */
    private function getPageIndexerClosure(): Closure
    {
        return function (Response $response): void {
            foreach ($this->getAllLinks($response->html) as $url) {
                if (isset($this->seenUrls[$url])) {
                    continue;
                }

                $this->seenUrls[$url] = true;
                $this->queue->enqueue($url);
            }

            $this->resultStorage[$response->url] = $response->cookies;
        };
    }

    /**
     * Extracts the links contained in the given HTML.
     *
     * @param string $html The HTML to scan.
     *
     * @return array The links found; currently always empty because extraction is not implemented.
     */
    private function getAllLinks(string $html): array
    {
        // TODO: implement logic
        return [];
    }
}
<?php
// Library Code
/**
 * Stand-in for the page object of the underlying library.
 *
 * BrowserTab::visit() calls both visit() and getResponse() on its page, so
 * this stub provides both methods.
 */
class Page
{
    /** @var string URL passed to the most recent visit() call; empty before the first visit. */
    private $lastVisitedUrl = '';

    /**
     * Records the URL as visited. The stub performs no actual navigation.
     *
     * @param string $url The URL to visit.
     */
    public function visit(string $url): void
    {
        $this->lastVisitedUrl = $url;
    }

    /**
     * Returns a response for the most recent visit.
     *
     * @return Response A new Response whose url is the last URL given to visit()
     *                  (empty string if visit() was never called); html and
     *                  cookies keep their defaults.
     */
    public function getResponse(): Response
    {
        $response = new Response();
        $response->url = $this->lastVisitedUrl;

        return $response;
    }
}
<?php
// Library Code
/**
 * Plain data holder describing the outcome of a page visit.
 *
 * All fields are public and start out empty.
 */
class Response
{
    /** @var string URL of the response. */
    public $url = '';

    /** @var string HTML body of the response. */
    public $html = '';

    /** @var array Cookies of the response. */
    public $cookies = [];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment