Created
November 5, 2020 18:10
-
-
Save vertexvaar/43e513b9add3be4488a0a93f3fb5b06a to your computer and use it in GitHub Desktop.
Parallel Browser Crawler concept
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Library Code | |
/**
 * Browser handle that hands out pages.
 */
class Browser
{
    /**
     * Creates a page in this browser.
     *
     * @return Page a newly constructed Page; each call builds a separate instance
     */
    public function createPage(): Page
    {
        $page = new Page();

        return $page;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Wraps one Page and tracks whether a visit is currently running on it.
 */
class BrowserTab
{
    /** The page this tab drives. */
    private Page $page;

    /** True only while visit() is executing. */
    private bool $isBusy = false;

    /**
     * @param Page $page page that all visits of this tab are performed on
     */
    public function __construct(Page $page)
    {
        $this->page = $page;
    }

    /**
     * Loads $url in the wrapped page and passes the page's response to $closure.
     *
     * The tab reports busy for the duration of the call. The flag is reset in a
     * `finally` block so that an exception thrown by the page or by $closure
     * cannot leave the tab marked busy forever; the exception itself still
     * propagates to the caller.
     *
     * @param string  $url     address to load
     * @param Closure $closure callback invoked with the value returned by Page::getResponse()
     */
    public function visit(string $url, Closure $closure)
    {
        $this->isBusy = true;
        try {
            /* await */ $this->page->visit($url);
            $closure($this->page->getResponse());
        } finally {
            $this->isBusy = false;
        }
    }

    /**
     * @return bool true while a visit() call is in progress on this tab
     */
    public function isBusy(): bool
    {
        return $this->isBusy;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Fixed-size pool of BrowserTab instances, all created from one Browser.
 */
class BrowserTabCollection
{
    /** Pause between two scans for an idle tab, in microseconds (0.1 s). */
    private const IDLE_POLL_INTERVAL_MICROSECONDS = 100000;

    /**
     * Initialised to an empty array so the collection stays iterable even when
     * it is constructed with a tab count of zero or less.
     *
     * @var BrowserTab[]
     */
    private $browserTabs = [];

    /**
     * @param Browser $browser   browser used to create one page per tab
     * @param int     $tabsCount number of tabs to create; values below 1 create none
     */
    public function __construct(Browser $browser, int $tabsCount)
    {
        for ($i = 0; $i < $tabsCount; $i++) {
            $this->browserTabs[$i] = new BrowserTab($browser->createPage());
        }
    }

    /**
     * Returns the first tab that is not busy, polling until one is available.
     *
     * While every tab is busy the method sleeps for the poll interval and
     * scans again, so it does not return until some tab reports idle.
     *
     * @return BrowserTab the first idle tab in creation order
     *
     * @throws LogicException when the collection holds no tabs at all, because
     *                        waiting could then never succeed
     */
    public function getIdleBrowserTab(): BrowserTab
    {
        if ($this->browserTabs === []) {
            throw new LogicException('The collection holds no browser tabs, so none can become idle.');
        }

        while (true) {
            foreach ($this->browserTabs as $browserTab) {
                if (!$browserTab->isBusy()) {
                    return $browserTab;
                }
            }
            usleep(self::IDLE_POLL_INTERVAL_MICROSECONDS);
        }
    }

    /**
     * @return bool true if at least one tab is busy; false otherwise, including
     *              when the collection is empty
     */
    public function isBusy(): bool
    {
        foreach ($this->browserTabs as $browserTab) {
            if ($browserTab->isBusy()) {
                return true;
            }
        }

        return false;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Crawls pages starting from a URL, visiting queued URLs on a pool of browser
 * tabs and recording each response's cookies keyed by the response URL.
 */
class Crawler
{
    /** Pause while the queue is empty but tabs are still busy, in microseconds (0.1 s). */
    private const QUEUE_POLL_INTERVAL_MICROSECONDS = 100000;

    /** @var SplQueue URLs waiting to be visited, in first-in-first-out order */
    private $queue;

    /** @var BrowserTabCollection pool of tabs the visits run on */
    private $browserTabCollection;

    /** @var array cookies of each visited response, keyed by the response URL */
    private $resultStorage = [];

    /** @var array<string, true> URLs that were already queued, used to skip repeated links */
    private $knownUrls = [];

    /**
     * Creates an empty URL queue and a pool of five browser tabs.
     */
    public function __construct()
    {
        $this->queue = new SplQueue();
        $this->browserTabCollection = new BrowserTabCollection(new Browser(), 5);
    }

    /**
     * Queues $url and processes the queue until it is empty and no tab is busy.
     *
     * An explicitly requested URL is always queued, even if it was seen before;
     * only links discovered during the crawl are de-duplicated.
     *
     * @param string $url address to start crawling from
     */
    public function crawl(string $url): void
    {
        $this->knownUrls[$url] = true;
        $this->queue->enqueue($url);
        $this->work();
    }

    /**
     * Returns everything collected so far.
     *
     * @return array cookies of each visited response, keyed by the response URL
     */
    public function getResults(): array
    {
        return $this->resultStorage;
    }

    /**
     * Hands queued URLs to idle tabs until there is nothing left to do.
     */
    private function work()
    {
        $closure = $this->getPageIndexerClosure();
        while (!$this->queue->isEmpty() || $this->browserTabCollection->isBusy()) {
            if ($this->queue->isEmpty()) {
                // The loop was kept alive only by busy tabs. SplQueue::dequeue()
                // throws on an empty queue, so wait and re-check instead.
                // NOTE(review): usleep() mirrors the polling already used by
                // BrowserTabCollection::getIdleBrowserTab(); a truly asynchronous
                // Page::visit() would presumably need a yielding wait here — confirm.
                usleep(self::QUEUE_POLL_INTERVAL_MICROSECONDS);
                continue;
            }

            $url = $this->queue->dequeue();
            $tab = $this->browserTabCollection->getIdleBrowserTab();
            $tab->visit($url, $closure);
        }
    }

    /**
     * Builds the callback that processes one visited page.
     *
     * The callback queues every link found in the response's HTML that has not
     * been queued before, then stores the response's cookies under its URL.
     *
     * @return Closure callback taking the Response of a visited page
     */
    private function getPageIndexerClosure(): Closure
    {
        return function (Response $response) {
            foreach ($this->getAllLinks($response->html) as $url) {
                if (isset($this->knownUrls[$url])) {
                    // Already queued once; skipping it keeps pages that link to
                    // each other from being crawled endlessly.
                    continue;
                }
                $this->knownUrls[$url] = true;
                $this->queue->enqueue($url);
            }
            $this->resultStorage[$response->url] = $response->cookies;
        };
    }

    /**
     * Extracts the links contained in an HTML document.
     *
     * @param string $html markup of a visited page
     *
     * @return array currently always empty; the extraction is not implemented yet
     */
    private function getAllLinks(string $html): array
    {
        // TODO: implement logic
        return [];
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Library Code | |
/**
 * Placeholder for the library's page object: records a visit and exposes the
 * resulting Response.
 */
class Page
{
    /** @var Response|null response of the most recent visit, null before the first one */
    private $response = null;

    /**
     * Visits $url and records a Response carrying that URL.
     *
     * This stub performs no network access; it only remembers which URL was
     * requested so that getResponse() has something to return.
     *
     * @param string $url address to load
     */
    public function visit(string $url): void
    {
        $response = new Response();
        $response->url = $url;
        $this->response = $response;
    }

    /**
     * Returns the response of the most recent visit.
     *
     * BrowserTab::visit() calls this method right after Page::visit(); without
     * it that call fails with an undefined-method error.
     *
     * @return Response response recorded by the last visit() call
     *
     * @throws LogicException when called before any visit()
     */
    public function getResponse(): Response
    {
        if ($this->response === null) {
            throw new LogicException('No page has been visited yet.');
        }

        return $this->response;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Library Code | |
/**
 * Plain data holder describing the outcome of loading one page.
 *
 * All fields are public and start out empty.
 */
class Response
{
    /** @var string address of the page; defaults to the empty string */
    public $url = '';

    /** @var string markup of the page; defaults to the empty string */
    public $html = '';

    /** @var array cookies of the page; defaults to an empty array */
    public $cookies = [];
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment