Skip to content

Instantly share code, notes, and snippets.

@loretoparisi
Created May 20, 2016 16:25
Show Gist options
  • Save loretoparisi/0da825688a000492c657290d7afacb5e to your computer and use it in GitHub Desktop.
Save loretoparisi/0da825688a000492c657290d7afacb5e to your computer and use it in GitHub Desktop.
A Node.js phantom example that runs a promise to evaluate a document condition Raw
(function() {
var Crawler,phantom,cheerio;
cheerio = require("cheerio"),
phantom = require('phantom');
/**
 * Polls an asynchronous predicate until it yields exactly `true`.
 *
 * @param {function(): *} asyncTest - Invoked on every attempt. It may return a
 *   promise or a plain value; polling stops once the (resolved) value is `true`.
 * @param {{interval: (number|undefined), timeout: (number|undefined)}} [options]
 *   Optional tuning. `interval` is the delay in milliseconds between attempts
 *   (default 100). `timeout` is the maximum total time in milliseconds before
 *   giving up; 0 or omitted means poll without a limit.
 * @returns {Promise<undefined>} Resolves with no value once the predicate is
 *   `true`. Rejects with the error thrown or rejected by `asyncTest`, or with
 *   a timeout `Error` when `timeout` elapses first.
 */
const waitUntil = function(asyncTest, options) {
  const settings = options || {};
  const interval = settings.interval === undefined ? 100 : settings.interval;
  const timeout = settings.timeout || 0;
  const startedAt = Date.now();

  return new Promise(function(resolve, reject) {
    // Always forward the reason so callers can see why polling stopped.
    function abort(e) {
      console.log('Error found. Rejecting.', e);
      reject(e);
    }

    function wait() {
      console.log('loop...');
      let pending;
      try {
        pending = asyncTest();
      } catch (e) {
        // Retries run from a timer, outside the Promise executor, so a
        // synchronous throw has to be routed to reject() explicitly.
        abort(e);
        return;
      }
      // Promise.resolve() lets the predicate return a plain value as well.
      Promise.resolve(pending).then(function(value) {
        console.log('value', value);
        if (value === true) {
          resolve();
        } else if (timeout > 0 && Date.now() - startedAt >= timeout) {
          abort(new Error('waitUntil timed out after ' + timeout + 'ms'));
        } else {
          setTimeout(wait, interval);
        }
      }).catch(abort);
    }

    wait();
  });
};
Crawler = (function() {
  /**
   * Simple Crawler
   *
   * Opens a URL through the `phantom` module, waits for a DOM condition and
   * reports the page HTML through callbacks.
   *
   * @uses phantom module
   * @author Loreto Parisi
   * @param {Object} [options] Overrides for the defaults below; every
   *   enumerable key replaces the default of the same name.
   * @param {*} [logger] Stored on the instance as `logger`; not used by the
   *   code in this block.
   */
  function Crawler(options, logger) {
    this.logger = logger;
    /** Crawler options, including the command line switches given to phantom */
    this.options = {
      debug: false, // true to debug page events
      error: true, // true to log page errors
      info: true, // true to log page info
      commandline: [
        '--ignore-ssl-errors=yes',
        '--load-images=no',
        '--ssl-protocol=tlsv1',
      ],
    };
    for (var attrname in options) { this.options[attrname] = options[attrname]; }
  }

  /**
   * Fetch url
   *
   * Loads `url`, polls (via `waitUntil`) until `.content > ul > li` matches
   * more than one element, then reads the page content and hands it to
   * `done`. The page and the phantom instance are released on every path.
   * Exactly one of `done` / `fail` is invoked per call.
   *
   * @param {string} url Address to open.
   * @param {function(string)} [done] Success callback; receives the page
   *   content read after the wait condition was met.
   * @param {function(Error)} [fail] Error callback; receives the failure. A
   *   page status other than 'success' is reported as `new Error(status)`.
   */
  Crawler.prototype.fetchUrl = function(url, done, fail) {
    var self = this;
    var sitepage = null;
    var phInstance = null;
    var pageResponse;
    // context for the page: handed to every event handler as its last argument
    var pageContext = {
      options: self.options
    };

    // Releases whatever has been created so far. Either handle may still be
    // null when an early step failed, so both are guarded.
    function cleanup() {
      if (sitepage) sitepage.close();
      if (phInstance) phInstance.exit();
      sitepage = phInstance = null;
    }

    phantom.create(self.options.commandline)
      .then(instance => {
        phInstance = instance;
        return instance.createPage();
      })
      .then(page => {
        pageResponse = phInstance.createOutObject();
        sitepage = page;
        // NOTE(review): the handlers below read only their own parameters;
        // pageContext is passed explicitly as an extra page.property()
        // argument rather than captured — presumably because phantom runs
        // them outside this closure; confirm against the phantom module docs.
        // attach events: resources
        page.property('onResourceRequested', function(requestData, networkRequest, pageContext) {
          if (pageContext.options.debug) console.log(requestData.url);
        }, pageContext);
        page.property('onResourceReceived', function(response, networkRequest, pageContext) {
          var msg = ' id: ' + response.id + ', stage: "' + response.stage + '", response: ' + JSON.stringify(response);
          if (pageContext.options.debug) console.log(msg);
        }, pageContext);
        page.property('onResourceError', function(resourceError, networkRequest, pageContext) {
          var errorString = resourceError.errorString;
          var errorPageUrl = resourceError.url;
          if (pageContext.options.error) console.log(resourceError, errorString, errorPageUrl);
        }, pageContext);
        // attach events: load
        page.property('onLoadStarted', function(pageContext) {
          if (pageContext.options.info) console.log('onLoadStarted');
        }, pageContext);
        page.property('onLoadFinished', function(response, pageContext, pageResponse) {
          if (pageContext.options.info) console.log('onLoadFinished');
        }, pageContext, pageResponse);
        // events: navigation
        page.property('onNavigationRequested', function(url, type, willNavigate, main, pageContext) {
          var msg = ' destination_url: ' + url;
          msg += ' type (cause): ' + type;
          msg += ' will navigate: ' + willNavigate;
          msg += ' from page\'s main frame: ' + main;
          if (pageContext.options.debug) console.log(msg);
        }, pageContext);
        // error: PhantomJS reports page script errors as (msg, trace)
        page.property('onError', function(msg, trace, pageContext) {
          if (pageContext.options.error) console.log(msg);
        }, pageContext);
        return page.open(url);
      })
      .then(status => {
        console.log("... - Page status [%s]", status);
        if (status !== 'success') {
          console.log("Error - Page status %s", status);
          throw new Error(status);
        }
        // Returned so that a rejected wait reaches the failure handler below
        // instead of becoming an unhandled rejection.
        return waitUntil(function() {
          return sitepage.evaluate(function() {
            return document.querySelectorAll('.content > ul > li').length > 1;
          });
        });
      })
      .then(() => {
        // Read the content only now, after the wait condition has been met.
        return sitepage.property('content');
      })
      .then(html => {
        console.log("Content ready - Page Status [%s]", 'success');
        console.log("----------------------------------- HTML ------------------------------");
        console.log(html);
        console.log("----------------------------------- HTML ------------------------------");
        cleanup();
        if (done) done(html);
      }, error => {
        // Registered as the second argument of then() so that an exception
        // thrown by `done` is not additionally reported through `fail`.
        console.log(error);
        cleanup();
        if (fail) return fail(error);
      });
  }; //fetchUrl

  return Crawler;
})();
// Expose the Crawler constructor as this module's export.
module.exports = Crawler;
}).call(this);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment