Created
June 18, 2012 15:47
-
-
Save crazy4groovy/2949017 to your computer and use it in GitHub Desktop.
Scrape the web using PhantomJS and jQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is an example of how to scrape the web using PhantomJS and jQuery.
// Source: http://snippets.aktagon.com/snippets/534-How-to-scrape-web-pages-with-PhantomJS-and-jQuery
// PhantomJS: http://phantomjs.org/
// Page object that loads, scripts and renders the pages being scraped.
var page = new WebPage();
// Address of the search form the scrape starts from.
var url = 'http://localhost/a-search-form';
// Number of the current step; used to label log output and screenshots.
var stepIndex = 0;
/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript console message on the
 * page. The callback may accept up to three arguments: the string for the
 * message, the line number, and the source identifier.
 *
 * Forwards the page's console output to this script's console, prefixed with
 * 'console> ' so it can be told apart from the script's own logging.
 *
 * @param {string} msg - Text of the console message.
 * @param {number} line - Line number of the message (not used here).
 * @param {string} source - Source identifier of the message (not used here).
 */
page.onConsoleMessage = function (msg, line, source) {
    console.log('console> ' + msg);
};
/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript alert. The only argument
 * passed to the callback is the string for the message.
 *
 * Logs the alert text, prefixed with 'alert!!> '.
 *
 * @param {string} msg - Text passed to alert() on the page.
 */
page.onAlert = function (msg) {
    console.log('alert!!> ' + msg);
};
// Callback is executed each time a page is loaded...
// NOTE(review): this relies on the load callback firing again after the form
// submit navigates to the results page — confirm that the PhantomJS version in
// use re-invokes the page.open callback on subsequent loads.
page.open(url, function (status) {
    // A failed load can never advance to the next step, so report it and stop
    // instead of leaving PhantomJS running with nothing left to do.
    if (status !== 'success') {
        console.log('Failed to load page, status: ' + status);
        phantom.exit(1);
        return;
    }

    // State is initially empty. State is persisted between page loads and can
    // be used for identifying which page we're on.
    console.log('============================================');
    console.log('Step "' + stepIndex + '"');
    console.log('============================================');

    // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the
    // same folder as this file). Every step uses $ inside the page, so a
    // failed injection is fatal.
    if (!page.injectJs('jquery-1.6.1.min.js')) {
        console.log('Failed to inject jquery-1.6.1.min.js');
        phantom.exit(1);
        return;
    }

    // Our "event loop": run the first step when no handler has been stored
    // yet, otherwise run whichever step the previous one registered.
    if (!phantom.state) {
        initialize();
    } else {
        phantom.state();
    }

    // Save screenshot for debugging purposes.
    // NOTE(review): when the step handler calls phantom.exit(), this render
    // runs after exit was requested — confirm the last screenshot is written.
    page.render('step' + stepIndex + '.png');
    stepIndex += 1;
});
// Step 1
/**
 * Fills in the search form on the loaded page, submits it, and registers
 * parseResults as the step to run on the next page load.
 */
function initialize() {
    // Phantom state doesn't change between page reloads.
    // We use the state to store the search result handler, ie. the next step.
    // It is registered before the submit so the handler is already in place
    // when the navigation is triggered.
    phantom.state = parseResults;

    // The callback is handed to page.evaluate and uses the page's injected
    // jQuery ($); it only relies on literals defined inside itself.
    page.evaluate(function () {
        $('form#search input.query').val('Jebus saves');
        $('form#search').submit();
        console.log('Searching...');
    });
}
// Step 2
/**
 * Logs the href of every link found under #search-result on the loaded page,
 * then exits PhantomJS because this is the last step.
 */
function parseResults() {
    // The callback is handed to page.evaluate and uses the page's injected
    // jQuery ($); its console output is relayed by page.onConsoleMessage.
    page.evaluate(function () {
        $('#search-result a').each(function (index, link) {
            console.log($(link).attr('href'));
        });
        console.log('Parsed results');
    });

    // If there was a 3rd step we could point to another function
    // but we would have to reload the page for the callback to be called again
    phantom.exit();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment