Created
February 26, 2019 05:29
-
-
Save tushargugnani/c4f1cee52c8395810e563f4d0732a130 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Tests\Browser; | |
use App\Page; | |
use Facebook\WebDriver\WebDriverBy; | |
use Tests\DuskTestCase; | |
use Laravel\Dusk\Browser; | |
use Illuminate\Foundation\Testing\DatabaseMigrations; | |
/**
 * Dusk browser test that crawls a site starting from a seed URL.
 *
 * Every discovered link on the configured domain is stored as a Page row
 * (url, isCrawled); once a page has been visited its row is updated with
 * the HTTP status and the document title.
 */
class duskSpiderTest extends DuskTestCase
{
    /** Domain the crawl is restricted to (the host itself and its subdomains). */
    protected static $domain = 'laravel.com';

    /** URL stored as the first page to crawl. */
    protected static $startUrl = 'https://laravel.com/';

    /**
     * Rebuild the database schema before each test so the crawl starts empty.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->artisan('migrate:fresh');
    }

    /** @test */
    public function urlSpider()
    {
        $startingLink = Page::create([
            'url' => self::$startUrl,
            'isCrawled' => false,
        ]);

        $this->browse(function (Browser $browser) use ($startingLink) {
            $this->getLinks($browser, $startingLink);
        });

        // Give the test an actual assertion: the crawl must have drained the queue.
        $this->assertFalse(
            Page::where('isCrawled', false)->exists(),
            'Every discovered page should have been crawled.'
        );
    }

    /**
     * Crawl $currentUrl, then keep crawling until no uncrawled Page remains.
     *
     * Implemented as a loop rather than recursion so the call depth does not
     * grow with the number of pages discovered.
     *
     * @param Browser $browser    Dusk browser used to visit the pages.
     * @param Page    $currentUrl Page row to start from.
     */
    protected function getLinks(Browser $browser, $currentUrl)
    {
        // As before, a failure on the starting page is not swallowed.
        $this->processCurrentUrl($browser, $currentUrl);

        while (($link = Page::where('isCrawled', false)->first()) !== null) {
            try {
                $this->processCurrentUrl($browser, $link);
            } catch (\Throwable $e) {
                // Fully-qualified on purpose: an unqualified "Exception" would
                // resolve inside this namespace and never match anything.
                error_log(sprintf('Failed to crawl %s: %s', $link->url, $e->getMessage()));
            } finally {
                // Guarantee progress: a page that failed (or was skipped) must
                // not be picked up again, otherwise this loop never ends.
                if ($link->isDirty() || !$link->isCrawled) {
                    $link->isCrawled = true;
                    $link->save();
                }
            }
        }
    }

    /**
     * Visit one page, store its valid links, and mark the page as crawled.
     *
     * @param Browser $browser    Dusk browser used to visit the page.
     * @param Page    $currentUrl Page row to process.
     */
    protected function processCurrentUrl(Browser $browser, $currentUrl)
    {
        // Check if already crawled (the row may be missing, so guard against null).
        $stored = Page::where('url', $currentUrl->url)->first();
        if ($stored !== null && $stored->isCrawled) {
            return;
        }

        // Visit URL
        $browser->visit($currentUrl->url);

        // Get links and save to DB if valid
        $linkElements = $browser->driver->findElements(WebDriverBy::tagName('a'));
        foreach ($linkElements as $element) {
            $href = $this->trimUrl($element->getAttribute('href'));
            if ($this->isValidUrl($href)) {
                Page::create([
                    'url' => $href,
                    'isCrawled' => false,
                ]);
            }
        }

        // Update current url status to crawled
        $currentUrl->isCrawled = true;
        $currentUrl->status = $this->getHttpStatus($currentUrl->url);
        $currentUrl->title = $browser->driver->getTitle();
        $currentUrl->save();
    }

    /**
     * Decide whether a URL should be queued for crawling.
     *
     * A URL qualifies when its host is the configured domain or one of its
     * subdomains and no Page row with that exact URL exists yet.
     *
     * @param string $url Candidate URL (already normalised by trimUrl()).
     * @return bool
     */
    protected function isValidUrl($url)
    {
        $host = parse_url((string) $url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        $host = strtolower($host);
        $domain = strtolower(self::$domain);
        $suffix = '.' . $domain;

        // Exact host or a dot-separated subdomain only; a plain substring test
        // would also accept hosts such as "notlaravel.com".
        $onDomain = $host === $domain || substr($host, -strlen($suffix)) === $suffix;

        return $onDomain && !Page::where('url', $url)->exists();
    }

    /**
     * Normalise a URL by dropping the fragment and any trailing slashes.
     *
     * @param string|null $url Raw href value; null when the anchor has no href.
     * @return string Normalised URL, or '' when there is nothing usable.
     */
    protected function trimUrl($url)
    {
        if (!is_string($url)) {
            return '';
        }

        // explode() instead of strtok(): strtok() skips leading delimiters, so
        // a fragment-only href like "#top" would come back as "top".
        $withoutFragment = explode('#', $url, 2)[0];

        return rtrim($withoutFragment, '/');
    }

    /**
     * Fetch the HTTP status code of the first response for a URL.
     *
     * @param string $url URL to request.
     * @return int Status code, or 0 when the request fails or the status line
     *             cannot be parsed.
     */
    protected function getHttpStatus($url)
    {
        try {
            $headers = get_headers($url, 1);
        } catch (\Throwable $e) {
            // Covers setups where the failure warning is converted into an
            // exception by an error handler.
            $headers = false;
        }

        if ($headers === false || !isset($headers[0]) || !is_string($headers[0])) {
            return 0;
        }

        // Parse the status line instead of relying on a fixed character offset.
        if (preg_match('#^HTTP/\S+\s+(\d{3})#', $headers[0], $matches) === 1) {
            return (int) $matches[1];
        }

        return 0;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment