Created
November 5, 2015 22:16
-
-
Save leongersen/70b8d85979a62f4a9b31 to your computer and use it in GitHub Desktop.
URL finder using fetch and promises. Logs a list of URLs on a domain to the console.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<script>
// Domain-specific crawl filter. Returns true when the (lowercased) path
// contains none of the excluded segments. Edit the list below to tailor
// the crawl for a particular site.
function isOkForDomain (a) {
  a = a.toLowerCase();
  // Sections that explode into many near-duplicate URLs
  // (bookings, availability, reviews, pricing, media) — skip them.
  var excluded = ['/booking/', '/beschikbaarheden/', '/reviews/', '/prijzen/', '/media/'];
  return excluded.every(function (segment) {
    return a.indexOf(segment) === -1;
  });
}
</script>
<!-- Live counter of requests currently in flight, updated by fetch.js. -->
<div id="count"></div>
<!-- Root domain to crawl; fetch.js reads this global. -->
<script>var DOMAIN = 'http://vialora.vp.local';</script>
<script src="fetch.js"></script>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Generic crawl filter: reject URLs that point at static assets.
// @param {string} a - URL or pathname to check (case-insensitive).
// @returns {boolean} true when the URL does not end in a known asset extension.
function isOk(a){
  a = a.toLowerCase();
  // Asset types that never contain further links worth crawling.
  var assetExtensions = ['.ico', '.js', '.css', '.jpg', '.jpeg', '.pdf', '.gif', '.png'];
  return assetExtensions.every(function (ext) {
    return !a.endsWith(ext);
  });
}
// All URLs seen so far; doubles as the final result list.
var handled = [];
// Number of fetch requests currently in flight.
var crawling = 0;
// DOM element that displays the number of open requests.
var count = document.getElementById('count');
// We are not parsing HTML: grab anything between double quotes that
// starts with '/' or 'http' (which also covers 'https').
var urlFinder = new RegExp('"((?:/|http)[^\"]+)"', "g");
// Host and protocol extracted from the crawled domain, filled in below.
var HOST, PROTOCOL;
(function (domain) {
  // An anchor element doubles as a URL parser in the browser.
  var parser = document.createElement('a');
  parser.href = domain;
  HOST = parser.hostname;
  PROTOCOL = parser.protocol;
}(DOMAIN));
/**
 * Fetch a page, extract candidate URLs, and recursively crawl every
 * same-host URL not seen before.
 * Side effects: pushes onto `handled`, updates `crawling` and the
 * `count` element, and schedules further crawls.
 * @param {string} crawl_url - Absolute URL to fetch.
 */
function crawl ( crawl_url ) {
  crawling++;
  count.innerText = crawling;
  handled.push(crawl_url);
  fetch(crawl_url, {
    method: 'get'
  }).then(function(response) {
    return response.text();
  }).then(function(text) {
    // BUGFIX: match() with a /g/ regex returns null when nothing matches;
    // fall back to an empty list so forEach below never throws.
    return text.match(urlFinder) || [];
  }).then(function(urls) {
    urls.forEach(function(url){
      // Resolve the quoted match to an absolute URL via an anchor element.
      var p = document.createElement('a');
      p.href = url.slice(1, -1); // strip the surrounding quotes
      url = PROTOCOL + '//' + HOST + p.pathname;
      if ( handled.indexOf(url) === -1 && (p.hostname === 'localhost' || p.hostname === HOST) && isOk(p.pathname) && isOkForDomain(p.pathname) ) {
        crawl(url);
      }
    });
  }).catch(function(reason) {
    // BUGFIX: the catch used to sit before the text() step, so a failed
    // fetch produced an undefined response, threw, and left `crawling`
    // permanently incremented. Catching here lets one failed page log
    // and continue without aborting the crawl.
    console.log('Caught failure: ' + reason);
  }).then(function() {
    // Runs after both success and failure so the pending counter stays accurate.
    crawling--;
    count.innerText = crawling;
    if ( !crawling ) {
      console.log('Done. Found ' + handled.length + ' urls.');
      console.log(handled.join('\n'));
    }
  });
}
// Kick off the crawl at the configured root domain.
crawl(DOMAIN);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To crawl external domains you'll have to run a browser with web security disabled, e.g. `chrome.exe --disable-web-security`.