Created
November 5, 2015 22:16
-
-
Save leongersen/70b8d85979a62f4a9b31 to your computer and use it in GitHub Desktop.
URL finder using fetch and promises. Logs a list of URLs on a domain to the console.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<script>
// Domain-specific crawl filter. Returns true when the (lowercased) path
// contains none of the excluded segments. Edit the list below to tailor
// the crawl for a particular site.
function isOkForDomain (a) {
  a = a.toLowerCase();
  // Sections that explode into many near-duplicate URLs
  // (bookings, availability, reviews, pricing, media) — skip them.
  var excluded = ['/booking/', '/beschikbaarheden/', '/reviews/', '/prijzen/', '/media/'];
  return excluded.every(function (segment) {
    return a.indexOf(segment) === -1;
  });
}
</script>
<!-- Live counter of requests currently in flight, updated by fetch.js. -->
<div id="count"></div>
<!-- Root domain to crawl; fetch.js reads this global. -->
<script>var DOMAIN = 'http://vialora.vp.local';</script>
<script src="fetch.js"></script>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Generic crawl filter: reject URLs that point at static assets.
// @param {string} a - URL or pathname to check (case-insensitive).
// @returns {boolean} true when the URL does not end in a known asset extension.
function isOk(a){
  a = a.toLowerCase();
  // Asset types that never contain further links worth crawling.
  var assetExtensions = ['.ico', '.js', '.css', '.jpg', '.jpeg', '.pdf', '.gif', '.png'];
  return assetExtensions.every(function (ext) {
    return !a.endsWith(ext);
  });
}
// All URLs seen so far; doubles as the final result list.
var handled = [];
// Number of fetch requests currently in flight.
var crawling = 0;
// DOM element that displays the number of open requests.
var count = document.getElementById('count');
// We are not parsing HTML: grab anything between double quotes that
// starts with '/' or 'http' (which also covers 'https').
var urlFinder = new RegExp('"((?:/|http)[^\"]+)"', "g");
// Host and protocol extracted from the crawled domain, filled in below.
var HOST, PROTOCOL;
(function (domain) {
  // An anchor element doubles as a URL parser in the browser.
  var parser = document.createElement('a');
  parser.href = domain;
  HOST = parser.hostname;
  PROTOCOL = parser.protocol;
}(DOMAIN));
/**
 * Fetch a page, extract candidate URLs, and recursively crawl every
 * same-host URL not seen before.
 * Side effects: pushes onto `handled`, updates `crawling` and the
 * `count` element, and schedules further crawls.
 * @param {string} crawl_url - Absolute URL to fetch.
 */
function crawl ( crawl_url ) {
  crawling++;
  count.innerText = crawling;
  handled.push(crawl_url);
  fetch(crawl_url, {
    method: 'get'
  }).then(function(response) {
    return response.text();
  }).then(function(text) {
    // BUGFIX: match() with a /g/ regex returns null when nothing matches;
    // fall back to an empty list so forEach below never throws.
    return text.match(urlFinder) || [];
  }).then(function(urls) {
    urls.forEach(function(url){
      // Resolve the quoted match to an absolute URL via an anchor element.
      var p = document.createElement('a');
      p.href = url.slice(1, -1); // strip the surrounding quotes
      url = PROTOCOL + '//' + HOST + p.pathname;
      if ( handled.indexOf(url) === -1 && (p.hostname === 'localhost' || p.hostname === HOST) && isOk(p.pathname) && isOkForDomain(p.pathname) ) {
        crawl(url);
      }
    });
  }).catch(function(reason) {
    // BUGFIX: the catch used to sit before the text() step, so a failed
    // fetch produced an undefined response, threw, and left `crawling`
    // permanently incremented. Catching here lets one failed page log
    // and continue without aborting the crawl.
    console.log('Caught failure: ' + reason);
  }).then(function() {
    // Runs after both success and failure so the pending counter stays accurate.
    crawling--;
    count.innerText = crawling;
    if ( !crawling ) {
      console.log('Done. Found ' + handled.length + ' urls.');
      console.log(handled.join('\n'));
    }
  });
}
// Kick off the crawl at the configured root domain.
crawl(DOMAIN);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To crawl external domains you'll have to run a browser with web security disabled, e.g. `chrome.exe --disable-web-security`.