Skip to content

Instantly share code, notes, and snippets.

@phpfiddle
Created July 25, 2013 04:03
Show Gist options
  • Save phpfiddle/6076828 to your computer and use it in GitHub Desktop.
Save phpfiddle/6076828 to your computer and use it in GitHub Desktop.
New Modified Google Scraper...
<?php
require_once('simple_html_dom.php');
/**
 * Scrapes Google web-search result pages and collects the organic results.
 *
 * Typical use: construct, call SearchQuery(), optionally ResultsPerPage(),
 * then LoadPages() and read the collected entries with GetResults().
 *
 * HTML parsing relies on the simple_html_dom class, which must already be
 * loaded by the including file.
 */
class GoogleScraper
{
    /** @var array Collected results; each entry has the keys 'title' and 'href'. */
    private $_results;

    /** @var string Scheme and host the search requests are sent to. */
    private $_baseUrl;

    /** @var string|int|float|null Query set through SearchQuery(). */
    private $_searchQuery;

    /** @var int Value sent as the "num" request parameter. */
    private $_resultsPerPage;

    /**
     * Set all defaults in one place.
     *
     * @param string $baseUrl Base URL without trailing slash; when empty,
     *                        'https://www.google.com' is used.
     */
    final public function __construct($baseUrl = '')
    {
        $this->_results = array();
        $this->_resultsPerPage = 100;
        $this->_baseUrl = empty($baseUrl) ? 'https://www.google.com' : $baseUrl;
    }

    /**
     * Cleanup.
     */
    final public function __destruct()
    {
        unset($this->_results);
        unset($this->_baseUrl);
        unset($this->_searchQuery);
    }

    /**
     * Set the query.
     *
     * @param string|int|float $searchQuery
     * @throws Exception When the query is neither a string nor numeric.
     */
    final public function SearchQuery($searchQuery)
    {
        if (!(is_string($searchQuery) || is_numeric($searchQuery))) {
            throw new Exception('Invalid query type');
        }
        $this->_searchQuery = $searchQuery;
    }

    /**
     * Set the number of results per page.
     *
     * @param int $resultsPerPage Integer between 10 and 100 (inclusive).
     * @throws Exception When the value is not an int or is out of range.
     */
    final public function ResultsPerPage($resultsPerPage)
    {
        if (!is_int($resultsPerPage) || $resultsPerPage < 10 || $resultsPerPage > 100) {
            throw new Exception('Results per page must be value between 10 and 100');
        }
        $this->_resultsPerPage = $resultsPerPage;
    }

    /**
     * Get the results collected so far.
     *
     * @return array List of array('title' => ..., 'href' => ...) entries.
     */
    final public function GetResults()
    {
        return $this->_results;
    }

    /**
     * Scrape the search results and append them to the collected results.
     *
     * @param int $pages Number of consecutive result pages to load (>= 1).
     * @param int $cPage 1-based index of the batch of $pages pages to load;
     *                   batch N starts at result page (N - 1) * $pages.
     * @throws Exception On an invalid page count, a missing query, or when a
     *                   page cannot be loaded.
     */
    final public function LoadPages($pages = 1, $cPage = 1)
    {
        if (!is_int($pages) || $pages < 1) {
            throw new Exception('Invalid number of pages');
        }
        // Compare explicitly instead of using empty(): '0' and 0 are valid queries.
        if ($this->_searchQuery === null || (string) $this->_searchQuery === '') {
            throw new Exception('Missing search query');
        }

        // Zero-based index of the first result page of the requested batch.
        $pageIndex = ($cPage > 1) ? ($cPage - 1) * $pages : 0;

        for ($loaded = 0; $loaded < $pages; $loaded++, $pageIndex++) {
            $url = $this->BuildSearchUrl($pageIndex * $this->_resultsPerPage);
            $content = $this->LoadUrl($url);
            if (!$content) {
                throw new Exception('Failed to load page');
            }
            $this->ParseResults($content);
        }
    }

    /**
     * Build the search URL for the result offset given.
     *
     * @param int $offset Zero-based index of the first result; the "start"
     *                    parameter is omitted for offset 0.
     * @return string
     */
    private function BuildSearchUrl($offset)
    {
        $url = $this->_baseUrl
            . '/search?num=' . $this->_resultsPerPage
            . '&q=' . urlencode($this->_searchQuery);
        if ($offset > 0) {
            // Must be joined with '&' so it is a parameter of its own and
            // not part of the q value.
            $url .= '&start=' . $offset;
        }
        return $url;
    }

    /**
     * Extract the normal search results from one page of HTML and append
     * them to the collected results.
     *
     * @param string $content Raw HTML of a result page.
     */
    private function ParseResults($content)
    {
        //Load content in to simple html dom
        $html = new simple_html_dom();
        $html->load($content);

        foreach ($html->find('div#ires li') as $item) {
            //Only normal search results have this container. Special results like found images or news dont have it.
            if (count($item->find('div.s')) != 1) {
                continue;
            }
            $head = $item->find('h3.r a', 0);
            //Without a heading link there is neither a title nor a target
            if (!$head) {
                continue;
            }
            $title = $head->plaintext;
            //If we dont have a title, there is no point in continuing
            if (empty($title)) {
                continue;
            }
            $this->_results[] = array(
                'title' => $title,
                'href' => $this->ResolveHref($head->href)
            );
        }

        // Release the parsed tree before the next page is loaded; only plain
        // strings were copied into the results above.
        $html->clear();
        unset($html);
    }

    /**
     * Return the value of the "q" query parameter of a link when it has one,
     * otherwise the link unchanged.
     *
     * @param mixed $href Link target as read from the anchor element.
     * @return mixed
     */
    private function ResolveHref($href)
    {
        if (empty($href)) {
            return $href;
        }
        $parts = explode('?', $href, 2);
        if (empty($parts[1])) {
            return $href;
        }
        parse_str($parts[1], $params);
        return empty($params['q']) ? $href : $params['q'];
    }

    /**
     * Load the url.
     *
     * @param string $url
     * @return string|false Response body when the final response status is
     *                      200, false otherwise.
     * @throws Exception When $url is not a string.
     */
    private function LoadUrl($url)
    {
        if (!is_string($url)) {
            throw new Exception('Invalid url');
        }
        $options = array(
            'http' => array(
                'user_agent' => "GoogleScraper",
                'timeout' => 5.5
            )
        );
        $context = stream_context_create($options);
        $content = file_get_contents($url, false, $context);
        // $http_response_header is filled in by the HTTP stream wrapper.
        if ($content === false || empty($http_response_header)) {
            return false;
        }
        return ($this->FinalStatusCode($http_response_header) === 200) ? $content : false;
    }

    /**
     * Find the status code of the last status line in a header list.
     *
     * When redirects are followed the list holds the headers of every
     * response in the chain, so the first status line is not the final one.
     *
     * @param array $headers Header lines as found in $http_response_header.
     * @return int Status code, or 0 when no status line is present.
     */
    private function FinalStatusCode($headers)
    {
        $status = 0;
        foreach ($headers as $line) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', $line, $match)) {
                $status = (int) $match[1];
            }
        }
        return $status;
    }
}
?>
<?php
// Demo driver: run one query, time how long loading 20 result pages takes,
// then dump every collected result.
$scraper = new GoogleScraper();
$scraper->SearchQuery('online exams');

$startedAt = microtime(true);
$scraper->LoadPages(20);
$elapsed = microtime(true) - $startedAt;
echo 'Loaded 20 pages in ' . $elapsed . 'sec';

// Copy the results into a freshly indexed (0..n-1) list.
$domains = array_values($scraper->GetResults());

echo "<pre>";
print_r($domains);
exit;
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment