Created
June 10, 2015 10:33
-
-
Save witwall/1ae8068c0cd57c0d0fe6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*jshint strict:false*/
/*global CasperError, console, phantom, require*/
/**
 * Capture multiple pages of google search results
 *
 * Usage:
 *
 * $ casperjs googleresults.js my search terms
 * $ casperjs googleresults.js my search terms --limit=5
 * $ casperjs googleresults.js my search terms --stream
 *
 * (all arguments will be used as the query)
 */
// Accumulates the links collected from every crawled results page.
var links = [];

// Single CasperJS instance shared by the whole script.
var casper = require("casper").create({
    // Maximum time (ms) the wait* helpers block before running their timeout callback.
    waitTimeout: 1000,
    pageSettings: {
        // Present the crawler as a desktop Chromium browser.
        userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36"
    }
});
// 1-based index of the results page currently being processed.
var currentPage = 1;

// Parse cli arguments

// search terms: every positional argument, joined with spaces
var search = casper.cli.args.join(" ");
// Number of page to crawl
var limit = casper.cli.options.limit || 10;
// print results as live stream
var stream = casper.cli.options.stream;
// return rich objects ({href, title}) instead of raw urls
var rich = casper.cli.options.rich;
var help = casper.cli.options.about;
// base delay, in seconds, before parsing each results page; coerced to a
// real number so the later `wait + random` is an addition, never a
// string concatenation
var wait = isNumber(casper.cli.options.wait) ? Number(casper.cli.options.wait) : 1;
// directory where screenshots are stored; the help text historically
// advertised `--screenshots`, so both spellings are accepted
var screenshot = casper.cli.options.screenshot || casper.cli.options.screenshots;

// Show the help once, either on request or when there is nothing to search.
// A single combined check keeps usage() from being invoked twice when
// `--about` is passed without search terms.
if (help || search.length === 0) {
    usage();
}
// On any CasperJS 'error' event: log it, keep a capture of the page as it
// was when the error occurred, and exit with a non-zero status.
casper.on('error', function (err) {
    casper.log(err, 'error');
    casper.capture('error.png');
    casper.exit(1);
});
/**
 * Print the command line help through casper.echo and exit with status 1.
 *
 * The option names listed here mirror the ones read from
 * `casper.cli.options` (note: `--screenshot`, singular).
 */
function usage() {
    casper
        .echo("Return a list of google results formated in JSON.")
        .echo("")
        .echo(" Usage:")
        .echo(" $ casperjs googleresults.js casperjs")
        .echo(" $ casperjs googleresults.js learn casperjs --limit=5 --stream")
        .echo("")
        .echo(" Options:")
        .echo(" --about show this help.")
        .echo(" --limit=LIMIT crawl LIMIT google pages (default 10).")
        .echo(" --stream return results when available. This writes formated results as soon as it is extracted.")
        .echo(" --rich return json objects instead of raw url.")
        .echo(" --wait time to wait before parsing google results.")
        .echo(" --screenshot directory where to store screenshots")
        .echo("")
        .exit(1);
}
/**
 * Tell whether a value is a finite number or a string that is entirely numeric.
 *
 * @param {*} n value to test
 * @returns {boolean} true for finite numbers and fully numeric strings;
 *   false for undefined, null, booleans, NaN, Infinity and strings with
 *   trailing garbage such as "12px"
 */
function isNumber(n) {
    // parseFloat rejects values with no numeric prefix; isFinite rejects
    // Infinity and strings that are only partially numeric.
    return !isNaN(parseFloat(n)) && isFinite(n);
}
/**
 * Retrieve links from a google results page.
 *
 * This function is handed to `casper.evaluate`, so it reads the global
 * `document` and is kept in plain ES5.
 *
 * @param {boolean} rich when truthy, return `{href, title}` objects;
 *   otherwise return the bare href strings
 * @returns {Array} one entry per element matching "h3.r a"
 */
function getLinks(rich) {
    // google handles redirects hrefs to some script of theirs
    // ("/url?q=<target>&sa=U..."); the capture group is the real target.
    var redirectPattern = /url\?q=(.*)&sa=U/;

    return Array.prototype.map.call(document.querySelectorAll("h3.r a"), function (anchor) {
        var rawHref = anchor.getAttribute("href");
        var match = redirectPattern.exec(rawHref);
        // Not a redirect link: keep the attribute value untouched.
        var href = match ? match[1] : rawHref;

        if (!rich) {
            return href;
        }
        return {
            href: href,
            title: anchor.innerText
        };
    });
}
/**
 * Serialize a batch of links as a typed JSON message for stream mode.
 *
 * @param {Array} links links to wrap
 * @returns {string} JSON text of `{type: 'links', links: links}`
 */
function serializeLinks(links) {
    return JSON.stringify({type: 'links', links: links});
}
/**
 * Write links to output.
 *
 * In stream mode the links are emitted as a typed message (see
 * serializeLinks); otherwise they are emitted as a bare JSON array.
 *
 * @param {Array|*} links a list of links, or a single link which is
 *   wrapped into a one-element list
 */
function formatLinks(links) {
    // `!links instanceof Array` parses as `(!links) instanceof Array`, which
    // is always false, so a single link was never wrapped. Array.isArray
    // performs the intended check.
    var list = Array.isArray(links) ? links : [links];

    // backward compatibility requires old format.
    var serialized = stream ? serializeLinks(list) : JSON.stringify(list);
    casper.echo(serialized);
}
/**
 * Serialize a screenshot notification as a typed JSON message.
 *
 * @param {string} filename name of the screenshot file
 * @returns {string} JSON text of `{type: 'screenshot', filename: filename}`
 */
function serializeScreenshot(filename) {
    return JSON.stringify({type: 'screenshot', filename: filename});
}
/**
 * Write a screenshot notification to output.
 *
 * @param {string} filename name of the screenshot file
 * @returns {*} whatever casper.echo returns
 */
function formatScreenshot(filename) {
    return casper.echo(serializeScreenshot(filename));
}
/**
 * Capture the current page into the screenshot directory and, in stream
 * mode, announce the file on output in a follow-up step.
 *
 * The file name is the current timestamp in base 36 plus ".png".
 *
 * @returns {Object} the casper instance
 */
function takeScreenshot() {
    var screenshotFile = (+new Date()).toString(36) + '.png';

    casper
        .capture(screenshot + '/' + screenshotFile)
        .then(function () {
            // send screenshot
            if (stream) {
                formatScreenshot(screenshotFile);
            }
        });

    return casper;
}
/**
 * Handle page crawling: collect the links of the current results page,
 * then either move on to the next page or finish the run.
 *
 * Must be invoked with the casper instance as `this`.
 */
var processPage = function () {
    // emulate a user looking at results with a random time
    // (`wait` seconds plus up to 3 extra seconds)
    var waitTime = wait + (Math.random() * 3);
    this.wait(waitTime * 1000);

    var url;
    var pageLinks;

    // capturing current page
    this.then(function () {
        // get all available links; evaluate() can come back with null,
        // which must not end up as a `null` entry in the results
        pageLinks = this.evaluate(getLinks, rich) || [];
        links = links.concat(pageLinks);

        // don't go too far down the rabbit hole
        var isLastPage = currentPage >= limit || !this.exists("#pnnext");

        // terminate() takes its own screenshot, so capturing here as well
        // on the last page would store the same page twice
        if (screenshot && !isLastPage) {
            takeScreenshot();
        }

        // if stream, then write to output
        if (stream) {
            formatLinks(pageLinks);
        }

        if (isLastPage) {
            return terminate.call(casper);
        }

        currentPage++;

        // Requesting next page
        url = this.getCurrentUrl();
        this
            // click on page next
            .thenClick("#pnnext")
            // wait url changes, then process the new page; give up on timeout
            .then(function () {
                this.waitFor(function () {
                    return url !== this.getCurrentUrl();
                }, processPage, terminate);
            });
    });
};
/**
 * Finish the crawl: optionally take a final screenshot, then write links
 * to the output if not streamed (streamed runs have already emitted them).
 *
 * Also used as the timeout callback of the wait* helpers.
 *
 * @param {*} [err] value supplied by the caller (e.g. the timeout passed
 *   by a wait* helper); currently unused
 */
function terminate(err) {
    if (screenshot) {
        takeScreenshot();
    }

    casper.then(function () {
        if (!stream) {
            formatLinks(links);
        }
    });
}
// Open google and submit the search form with the requested terms.
casper.start("https://www.google.com/?hl=en", function () {
    this.fill('form[action="/search"]', { q: search }, true);
});

// Wait for the result links themselves rather than for the "next page"
// link: a search that yields a single page of results has no #pnnext, and
// waiting for it would time out and report nothing. processPage already
// copes with a missing #pnnext by terminating after the current page.
casper.waitForSelector("h3.r a", processPage, terminate);

casper.run();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment