Created
May 20, 2016 16:25
-
-
Save loretoparisi/0da825688a000492c657290d7afacb5e to your computer and use it in GitHub Desktop.
A node phantom example that uses a promise to wait until a document condition evaluates to true
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(function() { | |
var Crawler,phantom,cheerio; | |
cheerio = require("cheerio"), | |
phantom = require('phantom'); | |
/**
 * Polls an asynchronous predicate until it yields exactly `true`.
 *
 * `asyncTest` is invoked immediately and then again after every
 * `intervalMs` milliseconds for as long as it settles with anything other
 * than `true` (truthy values such as `1` do NOT stop the polling).
 *
 * @param {function(): (Promise<*>|*)} asyncTest - predicate to poll; may return
 *        a promise or a plain value.
 * @param {number} [intervalMs=100] - delay between two attempts.
 * @param {number} [timeoutMs=Infinity] - give up once this much time has elapsed
 *        since the first attempt; the default never gives up.
 * @returns {Promise<undefined>} resolves (with no value) once the predicate
 *        yields `true`; rejects with the predicate's own error if it throws or
 *        rejects, or with an `Error` when `timeoutMs` is exceeded.
 */
var waitUntil = function(asyncTest, intervalMs = 100, timeoutMs = Infinity) {
  var startedAt = Date.now();
  return new Promise(function(resolve, reject) {
    function wait() {
      console.log('loop...');
      // Run the predicate inside a promise executor so that a synchronous
      // throw becomes a rejection even on retries scheduled by setTimeout,
      // where it would otherwise surface as an uncaught exception.
      new Promise(function(settle) {
        settle(asyncTest());
      }).then(function(value) {
        console.log('value', value);
        if (value === true) {
          resolve();
        } else if (Date.now() - startedAt >= timeoutMs) {
          reject(new Error('waitUntil: condition not met within ' + timeoutMs + 'ms'));
        } else {
          setTimeout(wait, intervalMs);
        }
      }).catch(function(e) {
        console.log('Error found. Rejecting.', e);
        // Forward the reason so callers can tell what went wrong.
        reject(e);
      });
    }
    wait();
  });
};
Crawler = (function() { | |
/**
 * Simple Crawler
 *
 * Stores the logger and builds the effective options by layering the
 * caller-supplied `options` on top of the defaults below.
 *
 * @uses phantom module
 * @author Loreto Parisi
 * @param {Object} [options] - overrides for the defaults; only the object's own
 *        enumerable properties are copied.
 * @param {*} [logger] - stored on the instance as `this.logger`.
 */
function Crawler(options, logger) {
  this.logger = logger;
  /** Command line phantom options */
  this.options = Object.assign({
    debug: false, // true to debug page events
    error: true, // true to log page errors
    info: true, // true to log page info
    commandline: [
      '--ignore-ssl-errors=yes',
      '--load-images=no',
      '--ssl-protocol=tlsv1',
    ],
  }, options); // Object.assign ignores a null/undefined source and skips inherited keys
}
/**
 * Fetch url
 *
 * Starts a phantom instance with `this.options.commandline`, opens `url`,
 * polls the page until it contains more than one `.content > ul > li`
 * element, then reads the page content and passes it to `done`.
 * The page and the phantom instance are released on every exit path.
 *
 * @param {string} url - address to open.
 * @param {function(string)} [done] - success callback, receives the page content.
 * @param {function(Error)} [fail] - error callback; receives `new Error(status)`
 *        when the page status is not 'success', otherwise the error that
 *        interrupted the chain.
 */
Crawler.prototype.fetchUrl = function(url, done, fail) {
  var self = this;
  var sitepage = null;
  var phInstance = null;
  var pageStatus;
  var pageResponse;
  // context for the page: handed to every page handler as a trailing argument
  var pageContext = {
    options: self.options
  };

  // Releases the page and the phantom process; safe to call when either was never created.
  function cleanup() {
    if (sitepage) {
      sitepage.close();
      sitepage = null;
    }
    if (phInstance) {
      phInstance.exit();
      phInstance = null;
    }
  }

  phantom.create(self.options.commandline)
    .then(instance => {
      phInstance = instance;
      return instance.createPage();
    })
    .then(page => {
      pageResponse = phInstance.createOutObject();
      sitepage = page;
      // NOTE(review): per the phantom module docs the handlers below are shipped to
      // the PhantomJS process, so they stay ES5 and read configuration from the
      // trailing `pageContext` argument instead of closing over Node variables.
      // Extra arguments are appended AFTER the event's own arguments, so each
      // signature must list exactly the arguments PhantomJS emits for that event.
      var handlers = [
        // attach events: resources
        page.property('onResourceRequested', function(requestData, networkRequest, pageContext) {
          if (pageContext.options.debug) console.log(requestData.url);
        }, pageContext),
        page.property('onResourceReceived', function(response, pageContext) {
          var msg = ' id: ' + response.id + ', stage: "' + response.stage + '", response: ' + JSON.stringify(response);
          if (pageContext.options.debug) console.log(msg);
        }, pageContext),
        page.property('onResourceError', function(resourceError, pageContext) {
          var errorString = resourceError.errorString;
          var errorPageUrl = resourceError.url;
          if (pageContext.options.error) console.log(resourceError, errorString, errorPageUrl);
        }, pageContext),
        // attach events: load
        page.property('onLoadStarted', function(pageContext) {
          if (pageContext.options.info) console.log('onLoadStarted');
        }, pageContext),
        page.property('onLoadFinished', function(response, pageContext, pageResponse) {
          if (pageContext.options.info) console.log('onLoadFinished');
        }, pageContext, pageResponse),
        // events: navigation
        page.property('onNavigationRequested', function(url, type, willNavigate, main, pageContext) {
          var msg = ' destination_url: ' + url;
          msg += ' type (cause): ' + type;
          msg += ' will navigate: ' + willNavigate;
          msg += ' from page\'s main frame: ' + main;
          if (pageContext.options.debug) console.log(msg);
        }, pageContext),
        // error: PhantomJS reports page script errors as (message, stack trace)
        page.property('onError', function(msg, trace, pageContext) {
          if (pageContext.options.error) console.log(msg, JSON.stringify(trace));
        }, pageContext)
      ];
      // Wait for the registrations so a failure reaches the error handler below
      // instead of becoming an unhandled rejection.
      return Promise.all(handlers).then(() => page.open(url));
    })
    .then(status => {
      console.log("... - Page status [%s]", status);
      pageStatus = status;
      if (pageStatus !== 'success') {
        console.log("Error - Page status %s", pageStatus);
        throw new Error(pageStatus);
      }
      // status is success: returning the promise keeps a polling failure inside this chain
      return waitUntil(function() {
        return sitepage.evaluate(function() {
          return document.querySelectorAll('.content > ul > li').length > 1;
        });
      });
    })
    // waitUntil resolves without a value, so read the content once the condition holds
    .then(() => sitepage.property('content'))
    .then(html => {
      console.log("Content ready - Page Status [%s]", pageStatus);
      console.log("----------------------------------- HTML ------------------------------");
      console.log(html);
      console.log("----------------------------------- HTML ------------------------------");
      return html;
    })
    // Two-argument form: an exception thrown by `done` must not be reported through `fail`.
    .then(html => {
      cleanup();
      if (done) done(html);
    }, error => {
      console.log(error);
      cleanup();
      if (fail) return fail(error);
    });
}; //fetchUrl
return Crawler; | |
})(); | |
module.exports = Crawler; | |
}).call(this); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment