Skip to content

Instantly share code, notes, and snippets.

@crazy4groovy
Created June 18, 2012 15:47
Show Gist options
  • Save crazy4groovy/2949017 to your computer and use it in GitHub Desktop.
Save crazy4groovy/2949017 to your computer and use it in GitHub Desktop.
Scrape the web using PhantomJS and jQuery
//This is an example of how to scrape the web using PhantomJS and jQuery:
//source: http://snippets.aktagon.com/snippets/534-How-to-scrape-web-pages-with-PhantomJS-and-jQuery
//http://phantomjs.org/
// Script-wide state: the PhantomJS page object, the URL that is opened first,
// and a counter used to number the log banners and debug screenshots.
var page = new WebPage();
var url = 'http://localhost/a-search-form';
var stepIndex = 0;
/**
 * Relays console output to the PhantomJS console with a "console> " prefix.
 *
 * From the PhantomJS documentation: this callback is invoked when there is a
 * JavaScript console message. It may accept up to three arguments: the message
 * string, the line number, and the source identifier. Only the message is
 * used here.
 */
page.onConsoleMessage = function relayConsoleMessage(message, lineNumber, sourceId) {
  var prefixed = 'console> ' + message;
  console.log(prefixed);
};
/**
 * Relays alert() text to the PhantomJS console with an "alert!!> " prefix.
 *
 * From the PhantomJS documentation: this callback is invoked when there is a
 * JavaScript alert. The only argument passed to it is the message string.
 */
page.onAlert = function relayAlert(message) {
  var prefixed = 'alert!!> ' + message;
  console.log(prefixed);
};
// Open the start URL and drive the scrape from the load callback.
// NOTE(review): the original author states that this callback is executed each
// time a page is loaded (which is what lets step 2 run after the form submit);
// whether that holds depends on the PhantomJS version -- confirm.
page.open(url, function (status) {
  // A failed load must end the script explicitly: the only other
  // phantom.exit() call lives in the final step, which is never reached when
  // loading fails, so PhantomJS would otherwise hang without any output.
  if (status !== 'success') {
    console.log('Failed to load page, status: ' + status);
    phantom.exit(1);
    return;
  }

  console.log('============================================');
  console.log('Step "' + stepIndex + '"');
  console.log('============================================');

  // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the
  // same folder as this file). injectJs signals failure by returning false;
  // report it, but carry on in case the page already provides its own `$`.
  if (page.injectJs('jquery-1.6.1.min.js') === false) {
    console.log('Warning: could not inject jquery-1.6.1.min.js');
  }

  // Our "event loop": phantom.state is initially empty, so the first load runs
  // initialize(); afterwards it holds the handler for the next step.
  if (!phantom.state) {
    initialize();
  } else {
    phantom.state();
  }

  // Save screenshot for debugging purposes
  page.render('step' + stepIndex++ + '.png');
});
// Step 1: fill in and submit the search form, then register the handler that
// should run for the next step.
function initialize() {
  // Passed to page.evaluate(); relies on `$` being available to it.
  var submitSearch = function () {
    $('form#search input.query').val('Jebus saves');
    $('form#search').submit();
    console.log('Searching...');
  };
  page.evaluate(submitSearch);

  // Per the original author, phantom state does not change between page
  // reloads, so it is used to remember the search result handler (the next
  // step).
  phantom.state = parseResults;
}
// Step 2: log the href of every link under #search-result, then quit PhantomJS.
function parseResults() {
  // Passed to page.evaluate(); relies on `$` being available to it.
  var printResultLinks = function () {
    $('#search-result a').each(function (position, anchor) {
      console.log($(anchor).attr('href'));
    });
    console.log('Parsed results');
  };
  page.evaluate(printResultLinks);

  // If there was a 3rd step we could point phantom.state at another function,
  // but the page would have to be reloaded for the load callback to run again.
  phantom.exit();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment