Created
November 27, 2015 22:53
-
-
Save weeksdev/a196a0c534d0156a639d to your computer and use it in GitHub Desktop.
PhantomJS Escaped Fragment Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// File-system module; used below to write each rendered page to disk.
var fs = require('fs');

// Root url the crawl starts from. Only <a> links that carry a hash and point
// back into this url are followed, so a link to some other website such as
// http://www.abc.com/something-here is NOT crawled.
var baseUrl = 'http://localhost:3000/';

// Folder the rendered html pages are saved to.
var saveFolder = 'public/_escaped_fragment_/';

// Every url + hash combination that has already been handed to the crawler.
var parsedLinks = [];
// Number of pages that have been opened but not yet fully processed. When it
// drops back to zero there is nothing left to crawl.
var pendingPages = 0;

// Releases a finished page and ends the PhantomJS process once no page is pending.
function finishPage(page) {
    page.close();
    pendingPages -= 1;
    if (pendingPages === 0) {
        phantom.exit();
    }
}

/**
 * Opens url + hash, saves the rendered body markup to disk and recursively
 * crawls every not-yet-seen hash link found on the page.
 *
 * The snapshot is written to saveFolder + <hash without '#!' / '#'> + '.html';
 * an empty hash is saved as 'Index.html'.
 *
 * @param {Object} page - web page object (as returned by require('webpage').create()).
 * @param {string} url - base url of the site being crawled.
 * @param {string} hash - hash fragment (including the leading '#') to load, or ''.
 */
var checkPage = function (page, url, hash) {
    var address = url + hash;

    // Record the address before loading so the same link is never queued twice.
    parsedLinks.push(address);
    pendingPages += 1;

    page.open(address, function (status) {
        if (status !== 'success') {
            // Do not write a snapshot of a page that failed to load.
            console.log('Failed to load ' + address);
            finishPage(page);
            return;
        }

        var filePath = hash.replace('#!', '').replace('#', '') + '.html';
        if (filePath === '.html') {
            filePath = 'Index.html';
        }
        console.log(filePath);

        // Snapshot after a fixed 2 second delay (presumably to give client-side
        // rendering time to finish).
        setTimeout(function () {
            try {
                var content = page.evaluate(function () {
                    return document.body.innerHTML;
                });
                fs.write(saveFolder + filePath, content, 'w');

                // Runs inside the page: it cannot see variables of this script.
                var hashes = page.evaluate(function () {
                    // Compare against the address WITHOUT its hash; otherwise links
                    // to sibling hashes never match once the current page has one.
                    var base = location.href.split('#')[0];
                    var anchors = document.getElementsByTagName('a');
                    var found = [];
                    for (var i = 0; i < anchors.length; i++) {
                        if (anchors[i].hash !== '' && anchors[i].href.indexOf(base) === 0) {
                            found.push(anchors[i].hash);
                        }
                    }
                    return found;
                });

                // evaluate may yield nothing if the in-page script failed.
                (hashes || []).forEach(function (foundHash) {
                    if (parsedLinks.indexOf(url + foundHash) === -1) {
                        checkPage(require('webpage').create(), url, foundHash);
                    }
                });
            } finally {
                // Child pages were registered above, so the pending count cannot
                // reach zero here while links are still waiting to be crawled.
                finishPage(page);
            }
        }, 2000);
    });
};
// Start the crawl at the root url with an empty hash.
checkPage(require('webpage').create(), baseUrl, '');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment