weeksdev · November 27, 2015 22:53
diff --git a/PhantomJsCrawl.js b/PhantomJsCrawl.js
 // PhantomJS file-system module; the crawler uses it to write the rendered snapshots
 var fs = require('fs');
 // Root URL the crawl starts from. Only anchors that point at this address and carry a hash
 // are followed; a link to another site (e.g. http://www.abc.com/something-here) is NOT parsed.
 var baseUrl = 'http://localhost:3000/';
 // Folder the HTML pages are saved into
 var saveFolder = 'public/_escaped_fragment_/';
 // Every url + hash combination that has already been handed to the crawler
 var parsedLinks = [];
 //method to parse given url/hash and iterator that recursively calls the additional pages
 var checkPage = function (page, url, hash) {
    parsedLinks.push(url + hash);
    page.open(url + hash, function (status) {
        var filePath = hash.replace('#!', '').replace('#', '') + '.html';
        if (filePath === '.html') {
            filePath = 'Index.html';
        }
        console.log(filePath);
        setTimeout(function (){
            var content = page.evaluate(function () {
                return document.body.innerHTML;
            });
            fs.write(saveFolder + filePath, content, 'w');
            var hashes = page.evaluate(function () {
                var elements = document.getElementsByTagName('a');
                hashes = [];
                for (var i = 0; i < elements.length; i++) {
                    if (elements[i].hash !== '' && elements[i].href.indexOf(location.href) === 0) {
                        hashes.push(elements[i].hash);
                    }
                }
                return hashes
            });
            
            hashes.forEach(function (hash) {
                if (parsedLinks.indexOf(url + hash) == -1) {
                    checkPage(require('webpage').create(), url, hash);
                }
            });
        }, 2000)
    });
 };
 // kick off the crawl: a fresh page, the base url, and no hash (the root page)
 (function () {
     var rootPage = require('webpage').create();
     checkPage(rootPage, baseUrl, '');
 }());
	// PhantomJS file-system module; the crawler uses it to write the rendered snapshots
	var fs = require('fs');
	// Root URL the crawl starts from. Only anchors that point at this address and carry a hash
	// are followed; a link to another site (e.g. http://www.abc.com/something-here) is NOT parsed.
	var baseUrl = 'http://localhost:3000/';
	// Folder the HTML pages are saved into
	var saveFolder = 'public/_escaped_fragment_/';
	// Every url + hash combination that has already been handed to the crawler
	var parsedLinks = [];
	//method to parse given url/hash and iterator that recursively calls the additional pages
	var checkPage = function (page, url, hash) {
	parsedLinks.push(url + hash);
	page.open(url + hash, function (status) {
	var filePath = hash.replace('#!', '').replace('#', '') + '.html';
	if (filePath === '.html') {
	filePath = 'Index.html';
	}
	console.log(filePath);
	setTimeout(function (){
	var content = page.evaluate(function () {
	return document.body.innerHTML;
	});
	fs.write(saveFolder + filePath, content, 'w');
	var hashes = page.evaluate(function () {
	var elements = document.getElementsByTagName('a');
	hashes = [];
	for (var i = 0; i < elements.length; i++) {
	if (elements[i].hash !== '' && elements[i].href.indexOf(location.href) === 0) {
	hashes.push(elements[i].hash);
	}
	}
	return hashes
	});

	hashes.forEach(function (hash) {
	if (parsedLinks.indexOf(url + hash) == -1) {
	checkPage(require('webpage').create(), url, hash);
	}
	});
	}, 2000)
	});
	};
	// kick off the crawl: a fresh page, the base url, and no hash (the root page)
	(function () {
	    var rootPage = require('webpage').create();
	    checkPage(rootPage, baseUrl, '');
	}());