Created
July 25, 2013 04:03
-
-
Save phpfiddle/6076828 to your computer and use it in GitHub Desktop.
New Modified Google Scraper...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require_once('simple_html_dom.php'); | |
/**
 * Scrapes Google web-search result pages and collects the title and target
 * URL of every regular (non-image, non-news) hit.
 *
 * Typical use: construct, call SearchQuery(), optionally ResultsPerPage(),
 * then LoadPages() and read the accumulated hits with GetResults().
 *
 * Depends on the `simple_html_dom` class from simple_html_dom.php.
 */
class GoogleScraper
{
    /** @var array[] Accumulated hits, each array('title' => ..., 'href' => ...). */
    private $_results;

    /** @var string Scheme + host of the search site, without a trailing slash. */
    private $_baseUrl;

    /** @var string|int|float|null Query set through SearchQuery(); null until set. */
    private $_searchQuery;

    /** @var int Value sent as the `num` parameter (10..100). */
    private $_resultsPerPage;

    /**
     * Set all defaults in one place.
     *
     * @param string $baseUrl Search site to query; an empty value selects
     *                        https://www.google.com.
     */
    final public function __construct($baseUrl='')
    {
        $this->_results = array();
        $this->_resultsPerPage = 100;
        if (empty($baseUrl)) {
            $this->_baseUrl = 'https://www.google.com';
        } else {
            // Strip trailing slashes so the request path never becomes "//search".
            $this->_baseUrl = rtrim($baseUrl, '/');
        }
    }

    /**
     * Cleanup: drop the collected state when the object is destroyed.
     */
    final public function __destruct()
    {
        unset($this->_results);
        unset($this->_baseUrl);
        unset($this->_searchQuery);
    }

    /**
     * Set the query.
     *
     * @param string|int|float $searchQuery Search terms.
     * @throws Exception When the value is neither a string nor numeric.
     */
    final public function SearchQuery($searchQuery)
    {
        if (!(is_string($searchQuery) || is_numeric($searchQuery))) {
            throw new Exception('Invalid query type');
        }
        $this->_searchQuery = $searchQuery;
    }

    /**
     * Set the number of results per page.
     *
     * @param int $resultsPerPage Integer between 10 and 100 inclusive.
     * @throws Exception When the value is not an int or is out of range.
     */
    final public function ResultsPerPage($resultsPerPage)
    {
        if (!is_int($resultsPerPage) || $resultsPerPage<10 || $resultsPerPage>100) {
            throw new Exception('Results per page must be value between 10 and 100');
        }
        $this->_resultsPerPage = $resultsPerPage;
    }

    /**
     * Get the results collected by all LoadPages() calls so far.
     *
     * @return array[] List of array('title' => ..., 'href' => ...).
     */
    final public function GetResults() { return $this->_results; }

    /**
     * Scrape the search results.
     *
     * Pages are fetched in batches: $cPage selects which batch of $pages
     * consecutive result pages to load, so batch 1 covers pages 0..$pages-1,
     * batch 2 covers pages $pages..2*$pages-1, and so on. Values of $cPage
     * that are not greater than 1 select the first batch.
     *
     * @param int $pages Number of result pages to fetch (>= 1).
     * @param int $cPage 1-based batch number.
     * @throws Exception On an invalid page count, a missing query, or when a
     *                   page cannot be loaded.
     */
    final public function LoadPages($pages=1, $cPage = 1)
    {
        if (!is_int($pages) || $pages<1) {
            throw new Exception('Invalid number of pages');
        }
        // Compare explicitly instead of using empty(): the query "0" is legal.
        if ($this->_searchQuery === null || (string)$this->_searchQuery === '') {
            throw new Exception('Missing search query');
        }

        $pageIndex = ($cPage > 1) ? ($cPage - 1) * $pages : 0;
        $endIndex = $pageIndex + $pages;

        for (; $pageIndex < $endIndex; $pageIndex++) {
            $content = $this->LoadUrl($this->BuildSearchUrl($pageIndex));
            if (!$content) {
                throw new Exception('Failed to load page');
            }
            $this->ParseResults($content);
        }
    }

    /**
     * Build the request URL for one zero-based result page.
     *
     * @param int $pageIndex Zero-based page number; page 0 omits `start`.
     * @return string
     */
    private function BuildSearchUrl($pageIndex)
    {
        $url = $this->_baseUrl
            . '/search?num=' . $this->_resultsPerPage
            . '&q=' . urlencode((string)$this->_searchQuery);

        if ($pageIndex > 0) {
            // Must be joined with "&"; any other separator makes the offset
            // part of the q value and every request returns the first page.
            $url .= '&start=' . ($pageIndex * $this->_resultsPerPage);
        }
        return $url;
    }

    /**
     * Parse one result page and append its regular hits to the result list.
     *
     * @param string $content Raw HTML of a result page.
     */
    private function ParseResults($content)
    {
        //Load content in to simple html dom
        $html = new simple_html_dom();
        $html->load($content);

        //Find and handle search results
        foreach ($html->find('div#ires li') as $item) {
            //Only normal search results have this container. Special results like found images or news dont have it.
            if (count($item->find('div.s')) != 1) {
                continue;
            }

            // find() with an index yields no node when nothing matches.
            $head = $item->find('h3.r a', 0);
            if (!$head) {
                continue;
            }

            //If we dont have a title, there is no point in continuing
            $title = $head->plaintext;
            if ((string)$title === '') {
                continue;
            }

            //Add the results to the total
            $this->_results[] = array(
                'title' => $title,
                'href'  => $this->ExtractTargetUrl($head->href),
            );
        }

        // The Simple HTML DOM documentation recommends clear() to release the
        // parsed tree; this matters when many pages are loaded in one run.
        $html->clear();
        unset($html);
    }

    /**
     * Return the `q` query-string parameter of a link when it has one,
     * otherwise the link unchanged.
     *
     * NOTE(review): this unwraps any link carrying a non-empty `q` parameter,
     * not only redirect links - same as the previous inline logic. Confirm
     * whether it should be restricted to a specific path.
     *
     * @param mixed $href Value of the anchor's href attribute.
     * @return mixed
     */
    private function ExtractTargetUrl($href)
    {
        if (empty($href)) {
            return $href;
        }

        // Limit of 2 keeps the whole query string even if it contains "?".
        $parts = explode('?', $href, 2);
        if (empty($parts[1])) {
            return $href;
        }

        parse_str($parts[1], $querystring);
        return !empty($querystring['q']) ? $querystring['q'] : $href;
    }

    /**
     * Load the url.
     *
     * @param string $url Absolute URL to fetch.
     * @return string|false Response body when the final HTTP status is 200,
     *                      false otherwise.
     * @throws Exception When $url is not a string.
     */
    private function LoadUrl($url)
    {
        if (!is_string($url)) {
            throw new Exception('Invalid url');
        }

        $options = array(
            'http' => array(
                'user_agent' => "GoogleScraper",
                'timeout' => 5.5
            )
        );
        $context = stream_context_create($options);
        $content = file_get_contents($url, false, $context);

        if ($content === false || empty($http_response_header)) {
            return false;
        }

        // When redirects are followed the header list holds one status line
        // per hop, so the last status line is the one describing $content.
        $status = 0;
        foreach ($http_response_header as $header) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', $header, $match)) {
                $status = (int)$match[1];
            }
        }

        return ($status === 200) ? $content : false;
    }
}
?> | |
<?php | |
// Demo driver: run one query across 20 result pages, report the elapsed
// time, then dump every collected hit.
$scraper = new GoogleScraper();
$scraper->SearchQuery('online exams');

$startedAt = microtime(true);
$scraper->LoadPages(20);
echo 'Loaded 20 pages in ' . (microtime(true) - $startedAt) . 'sec';

// Copy the hits into a fresh, sequentially keyed list.
$domains = array();
foreach ($scraper->GetResults() as $hit) {
    $domains[] = $hit;
}

// Preformatted dump of the collected hits; the script stops here.
echo "<pre>";
print_r($domains);
exit;
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment