-
-
Save Manouchehri/58e83738e72930380527e02b2d7b1782 to your computer and use it in GitHub Desktop.
Website Crawler using PhantomJS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Crawl configuration and shared state.
//
// targetAddress: the URL the crawl starts from; its hostname also names the
// output directory and bounds which pages have their links followed.
var targetAddress = 'http://www.autohotkey.com/board/';

// fileCount: running index used as the file name of the next rendered PDF
// (0.pdf, 1.pdf, ...). Incremented once per successfully rendered page.
var fileCount = 0;
// Global handler for errors raised in the PhantomJS (outer) context.
// Prints the message followed by one line per stack frame, then terminates
// the process with exit code 1.
//   msg   - the error message
//   trace - array of frames; each frame is read for file/sourceURL, line and
//           (optionally) function
phantom.onError = function (msg, trace) {
    var report = ['PHANTOM ERROR: ' + msg];
    var i, frame, where, inFunction;

    if (trace && trace.length) {
        report.push('TRACE:');
        for (i = 0; i < trace.length; i++) {
            frame = trace[i];
            where = frame.file || frame.sourceURL;
            inFunction = frame.function ? ' (in function ' + frame.function + ')' : '';
            report.push(' -> ' + where + ': ' + frame.line + inFunction);
        }
    }

    // Written with console.log (not console.error) so the report lands on
    // the same stream as the crawl's regular output.
    console.log(report.join('\n'));
    phantom.exit(1);
};
// Load the URL parsing helper into the PhantomJS context and expose it under
// the name URL. The script expects it to define a global URLUtils
// constructor (presumably derived from https://gist.github.com/Yaffle/1088850
// -- verify against includes/URLutils.js).
phantom.injectJs('includes/URLutils.js');
var URL = URLUtils;

// browser object: owns the current WebPage instance and knows how to load,
// render and scrape a single address.
var browser = { name: 'browser' };
// Creates and configures a fresh WebPage instance.
//   returns - a new page with console forwarding and a 1024x1024 viewport
// No page.onError handler and no paperSize are installed here, so PhantomJS
// defaults apply to in-page script errors and to the rendered paper format.
browser.initPage = function () {
    var page = require('webpage').create();

    // Forward anything the page itself logs to this script's stdout.
    page.onConsoleMessage = function (message) {
        console.log(message);
    };

    page.viewportSize = { width: 1024, height: 1024 };

    return page;
};
// Placeholder until the first refreshPage() call; a real WebPage is only
// created on demand so that no page is built and then thrown away unused.
browser.page = {};

// Replaces the current page with a brand new one from initPage().
// The previous page is closed first: dropping the reference alone does not
// release the WebPage, so a long crawl would otherwise accumulate one live
// page per visited URL. The initial placeholder object has no close(), hence
// the guard.
browser.refreshPage = function () {
    var previous = this.page;
    if (previous && typeof previous.close === 'function') {
        previous.close();
    }
    this.page = this.initPage();
};
// Opens `address` in the current page, renders it to
// "<mainHostname>/<fileCount>.pdf" and hands every anchor href found on the
// page to `callback`.
//   address      - URL to load
//   mainHostname - name of the output directory (the crawl's root hostname)
//   callback     - called once with an array of base64-encoded href strings
//                  (always an array, possibly empty)
// If the page cannot be loaded the whole crawl is aborted with exit code 1.
browser.processPage = function (address, mainHostname, callback) {
    var page = this.page;

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load '+ address);
            phantom.exit(1);
            return;
        }

        // Give the page five seconds to settle (late scripts, images) before
        // rendering and scraping it.
        window.setTimeout(function () {
            var output, hrefs;

            // Use '/' so the file lands inside the directory that
            // initFilesystem created as hostname + '/'. A backslash would
            // become part of the file name on non-Windows systems.
            output = mainHostname + '/' + (fileCount++) + ".pdf";
            page.render(output);

            // jQuery is injected for link collection. A failed injection is
            // reported but not fatal: the page may ship its own `$`, and if it
            // does not, the evaluate below simply yields no links.
            if (!page.injectJs('includes/jquery.js')) {
                console.log('Unable to inject includes/jquery.js into ' + address);
            }

            hrefs = page.evaluate(function () {
                var links = [];
                $('a').each(function () {
                    // btoa throws on characters outside Latin-1; skip just
                    // that link instead of losing every link on the page.
                    try {
                        links.push(btoa($(this).attr('href')));
                    } catch (e) {
                        // intentionally skipped: href cannot be encoded
                    }
                });
                return links;
            });

            // page.evaluate yields null/undefined when the in-page function
            // fails; normalise so the callback always receives an array.
            callback(hrefs || []);
        }, 5000);
    });
};
// Loads `address` in a fresh page and delivers the collected links.
//   address      - URL to load
//   mainHostname - output directory name, passed through to processPage
//   callback     - function receiving the array of collected hrefs
//   cbObject     - object used as `this` when `callback` is invoked
browser.load = function (address, mainHostname, callback, cbObject) {
    var deliver;

    // Start from a clean WebPage for every address.
    this.refreshPage();

    deliver = callback.bind(cbObject);
    this.processPage(address, mainHostname, deliver);
};
// linkProcessor: controller object that drives the crawl.
//   mainURL / mainHostname - parsed (lower-cased) start address and its host
//   url                    - the page currently being processed
//   linkList               - absolute URLs discovered but not yet visited
//   pdfList                - URLs that have already been processed
var linkProcessor = {
    name: 'linkProcessor',
    mainURL: new URL(targetAddress.toLowerCase())
};
linkProcessor.mainHostname = linkProcessor.mainURL.hostname;
linkProcessor.url = linkProcessor.mainURL;
linkProcessor.linkList = [];
linkProcessor.pdfList = [];

// newLinkHandler: mailbox through which the browser hands the links of the
// page it just scraped back to linkProcessor. Its methods are attached below.
var newLinkHandler = { name: 'newLinkHandler' };
linkProcessor.newLinkHandler = newLinkHandler;
// Returns the decoded hrefs stored by the most recent set() call.
newLinkHandler.get = function () {
    return this.hrefs;
};

// Reports whether set() has been called since the last reset().
newLinkHandler.isReady = function () {
    return this.ready;
};

// Stores the links collected from a page and marks the handler as ready.
//   hrefsArray - array of base64-encoded href strings; null/undefined is
//                treated as "no links"
// The decoded values go into a NEW array. Reusing hrefsArray as the storage
// and pushing onto it would leave both the encoded and the decoded form of
// every link in the list and would mutate the caller's array.
newLinkHandler.set = function (hrefsArray) {
    var encoded = hrefsArray || [];
    this.hrefs = encoded.map(function (item) {
        return atob(item);
    });
    this.ready = true;
};

// Clears the stored links and the ready flag before the next page load.
newLinkHandler.reset = function () {
    this.hrefs = [];
    this.ready = false;
};
// Polls `testFx` until it returns a truthy value, then runs `onReady` once.
// Adapted from https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
//   testFx        - function (or source string) evaluated on every poll
//   onReady       - function (or source string) run on the first poll AFTER
//                   testFx has returned truthy
//   timeOutMillis - optional overall limit in ms; defaults to 30000 (30 s)
//   pollMillis    - optional poll period in ms; defaults to 3000 (3 s)
// If the limit is reached without testFx succeeding, "time out error" is
// logged and the process exits with code 1.
linkProcessor.waitFor = function (testFx, onReady, timeOutMillis, pollMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 30000;
    var pollEvery = pollMillis ? pollMillis : 3000;
    var start = new Date().getTime();
    var condition = false;

    var interval = setInterval(function () {
        var withinLimit = (new Date().getTime() - start) < maxtimeOutMillis;

        if (withinLimit && !condition) {
            // Not timed out and not yet fulfilled: test again.
            // NOTE(review): string arguments are eval'ed for compatibility
            // with the upstream example; only pass trusted strings.
            condition = (typeof(testFx) === "string" ? eval(testFx) : testFx());
            return;
        }

        // Either finished or timed out. Stop polling BEFORE doing anything
        // else, so that a throwing onReady (or a slow exit) can never cause
        // this callback to fire a second time.
        clearInterval(interval);

        if (!condition) {
            console.log("time out error");
            phantom.exit(1);
            return;
        }

        if (typeof(onReady) === "string") {
            eval(onReady);
        } else {
            onReady();
        }
    }, pollEvery);
};
// Called once the links of the current page are available. Files the new
// links, records the current page as done and moves on to the next page, or
// finishes the crawl when nothing is left.
//   browser - the browser object handed on to start() for the next page
// Links are only harvested from pages on the crawl's own hostname; pages on
// other hosts are rendered but their links are not followed.
linkProcessor.resume = function (browser) {
    var hrefs, link, n, count;

    if (this.url.hostname === this.mainHostname) {
        hrefs = this.newLinkHandler.hrefs;
        for (n = 0, count = hrefs.length; n < count; ++n) {
            // Lower-cased so the duplicate checks below are case-insensitive.
            // NOTE(review): this also lower-cases the path and query, which
            // may not match case-sensitive servers -- confirm this is wanted.
            link = hrefs[n].toLowerCase();

            if (link.charAt(0) === "/") {
                // Root-relative (or protocol-relative) link: resolve it
                // against the absolute URL of the page it was found on. A bare
                // hostname is not a valid base and never yields an http(s)
                // URL, so such links used to be dropped.
                link = new URL(link, this.url.href).href;
            }

            if (link.indexOf("http://") !== 0 && link.indexOf("https://") !== 0) {
                // Anything else (mailto:, javascript:, page-relative paths,
                // fragments) is discarded for now.
                continue;
            }

            if (this.linkList.indexOf(link) === -1 &&
                    this.pdfList.indexOf(link) === -1 &&
                    this.url.href !== link) {
                this.linkList.push(link);
            }
        }
    }

    this.pdfList.push(this.url.href);

    if (this.linkList.length > 0) {
        // Most recently discovered link is visited next.
        this.url = new URL(this.linkList.pop());
        this.start(browser);
    } else {
        console.log('finished');
        phantom.exit();
    }
};
// Processes the page in this.url: logs the "<n>.pdf, <url>" line, clears the
// link mailbox, asks the browser to load/render/scrape the page, and schedules
// resume() to run once the links have been delivered.
//   browser - object providing load(address, mainHostname, callback, cbObject)
linkProcessor.start = function (browser) {
    var handler = this.newLinkHandler;
    var linksArrived = handler.isReady.bind(handler);
    var carryOn = this.resume.bind(this, browser);

    console.log(fileCount + '.pdf, ' + this.url.href);
    handler.reset();

    // handler.set is passed unbound; browser.load binds it to `handler`.
    browser.load(this.url.href, this.mainHostname, handler.set, handler);
    this.waitFor(linksArrived, carryOn);
};
// Makes sure the output directory "<hostname>/" exists before the crawl
// starts. A failure to create it is reported but does not stop the script.
(function initFilesystem(hostname) {
    var fs = require('fs');
    var path = hostname + '/';
    var missing = (fs.isDirectory(path) == false);

    if (missing && !fs.makeDirectory(path)) {
        console.log('error: ' + path + ' directory could not be created.');
    }
})(linkProcessor.mainHostname);

// Header for the "<n>.pdf, <url>" lines printed by linkProcessor.start,
// then kick off the crawl at the start address.
console.log('file name, url address');
linkProcessor.start(browser);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment