Last active
May 23, 2017 16:15
-
-
Save leodutra/3cd39249315b37da7dba to your computer and use it in GitHub Desktop.
Simple JavaScript Crawler (js, crawler, javascript)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
JavaScript Link Crawler
author: Leonardo Dutra ([email protected])
Instructions: open browser console, paste, run.
*/
// Crawler state shared by every function in this script.
var limit = 30000; // run() keeps crawling while the number of successful fetches is below this
var linkHolder = {}; // object used as a hash of links: URL -> LinkInfo
var SCPReferenciado = []; // not referenced anywhere in the visible source; kept for compatibility
var visitedDomains = []; // host names whose pending links get priority in getUnvisitedURI()
// Declared one by one: the chained form `var success = found = executionTime = 0`
// only declares `success` and leaks the other two as implicit globals.
var success = 0; // number of pages fetched successfully
var found = 0; // number of distinct links registered in linkHolder
var executionTime = 0; // timestamp (ms) of the last start() call
var currentDomain = null; // host name currently being crawled
var stopped = false; // set once the crawl is paused by run()
// Extracts page links (.html, .php, .asp, .do, .jsp, .htm) from a chunk of
// markup or script text.
//   html   - raw response text to scan
//   origin - URL the text was fetched from; handed to relativeToAbsolute()
// Returns whatever relativeToAbsolute() yields for the cleaned candidates,
// or null when nothing in the text looks like a link.
function getLinks(html, origin) {
    var cleaned = removeComments(html);
    // matches href/src attributes as well as links embedded in JavaScript
    var candidates = cleaned.match(/(?:\b(?:href|src)[^=]*=|["'>])[^"'<>]+?\.(?:html|php|asp|do|jsp|htm)\b[^"'<>\s]*/gim);
    if (!candidates) {
        return null;
    }
    for (var n = 0; n < candidates.length; n++) {
        // strip whitespace, line breaks, quotes and the leading attribute name
        candidates[n] = candidates[n].replace(/["\s'>\r\n]+|^(?:href|src)[^=]*=/gim, '');
    }
    return relativeToAbsolute(candidates, origin);
}
// Removes HTML comments and JavaScript comments from a text so that links
// appearing only inside comments are not crawled.
//   html - text to clean
// Returns the text without comments.
function removeComments(html) {
    return html
        // HTML comments (<!-- ... -->) and JS block comments (/* ... */).
        // Both are matched lazily up to their real terminator, so a comment
        // that contains tags (a ">" inside it) or ends in "**/" is removed
        // completely instead of being cut short.
        .replace(/<!--[\s\S]*?-->|\/\*[\s\S]*?\*\//g, '')
        // JS line comments. A "//" preceded by ":" is kept so that
        // "http://" survives.
        // TODO: improve handling of "//" that is not an actual comment
        // (e.g. protocol-relative URLs).
        .replace(/([^:])\/\/.*/gm, '$1');
}
// Builds a stand-in for the `location` object for an arbitrary URI: an
// anchor element whose href is set to that URI.
//   uri - address to assign to the anchor
// Returns the anchor element.
function getLocationInfo(uri) {
    var anchor = document.createElement('a');
    anchor.href = uri;
    return anchor;
}
// Extracts the "scheme://host" prefix of a URI.
//   uri - absolute URI
// Returns the matched prefix; throws a TypeError when the URI does not
// start with "scheme://host" (the match result is null).
function getDomain(uri) {
    var originPattern = /^\w+:\/\/[^\/]+/;
    var hit = uri.match(originPattern);
    return hit[0];
}
// Turns relative paths into complete URIs, in place.
//   links  - array of link strings; its entries are overwritten
//   origin - absolute URI of the page the links were found on
// Returns the same `links` array.
function relativeToAbsolute(links, origin) {
    var basePath = getAbsolutePath(origin) + '/';
    var root = getDomain(origin) + '/';
    var hasScheme = /^\w+:\/\//i;
    for (var n = links.length - 1; n >= 0; n--) {
        var resolved = links[n];
        if (!hasScheme.test(resolved)) {
            // root-relative links hang off the domain, the rest off the page's directory
            var prefix = resolved.charAt(0) === '/' ? root : basePath;
            resolved = prefix + resolved;
        }
        // collapse runs of slashes, except the pair that follows the scheme's ":"
        links[n] = resolved.replace(/([^:])\/\/+/g, '$1/');
    }
    return links;
}
// function backTrackURI(uri) { | |
// return uri.replace(/(?:http[s]?:\/)?\/*?[^\/]+?\/?$/im, ''); | |
// } | |
// Returns the directory part of an absolute URI: "scheme://host" plus the
// path up to and including its last "/". Query string and fragment are
// ignored. Unlike the previous pattern-based version, directory names that
// contain a dot (e.g. "/v1.2/") are kept instead of cutting the path short.
//   uri - absolute URI
// Returns the base path, e.g. "http://a.com/dir/" for
// "http://a.com/dir/page.html", or "http://a.com" when there is no path.
// Throws a TypeError when the URI does not start with "scheme://host".
function getAbsolutePath(uri) {
    var parts = /^(\w+:\/\/[^\/?#]+)([^?#]*)/.exec(uri);
    if (!parts) {
        throw new TypeError('getAbsolutePath: not an absolute URI: ' + uri);
    }
    var path = parts[2];
    // lastIndexOf() is -1 for an empty path, which makes the slice empty
    return parts[1] + path.slice(0, path.lastIndexOf('/') + 1);
}
// Picks the next link to crawl, giving priority to domains already seen.
// Search order:
//   1. pending links containing any entry of visitedDomains;
//   2. pending links containing currentDomain; when none is left,
//      currentDomain is appended to visitedDomains;
//   3. any pending link, whose host name becomes the new currentDomain.
// A link is pending while its status is '_'.
// Returns the chosen link, or null when nothing is pending.
function getUnvisitedURI() {
    // first pending link whose URL contains `fragment`, or null
    function findPending(fragment) {
        for (var uri in linkHolder) {
            if (uri.indexOf(fragment) !== -1 && linkHolder[uri].status === '_') {
                return uri;
            }
        }
        return null;
    }

    var domainCount = visitedDomains.length;
    for (var d = 0; d < domainCount; d++) {
        var fromVisited = findPending(visitedDomains[d]);
        if (fromVisited !== null) {
            return fromVisited;
        }
    }

    if (currentDomain) {
        var fromCurrent = findPending(currentDomain);
        if (fromCurrent !== null) {
            return fromCurrent;
        }
        visitedDomains.push(currentDomain);
    }

    for (var candidate in linkHolder) {
        if (linkHolder[candidate].status === '_') {
            currentDomain = getLocationInfo(candidate).hostname;
            return candidate;
        }
    }
    return null;
}
// Record kept for each discovered link. Defined as a constructor with
// defaults on the prototype so the defaults are shared by all instances.
//   origin - URL of the page where the link was found; when omitted the
//            prototype default '' applies instead of storing `undefined`
// Fields:
//   status - '_' until another status is assigned to the instance
//   origin - see above
function LinkInfo(origin) {
    if (origin != null) {
        this.origin = origin;
    }
}
// Defaults are added to the existing prototype rather than replacing it,
// which keeps LinkInfo.prototype.constructor pointing at LinkInfo.
LinkInfo.prototype.status = '_';
LinkInfo.prototype.origin = '';
// Registers links in the control hash (linkHolder), skipping those already
// present, and bumps the `found` counter for each new one.
//   links  - array of link strings, or a falsy value (ignored)
//   origin - URL of the page the links came from, stored in each LinkInfo
function pushLinks(links, origin) {
    if (!links) {
        return;
    }
    // walked from last to first: this determines the insertion order in linkHolder
    for (var idx = links.length - 1; idx >= 0; idx--) {
        var uri = links[idx];
        if (!linkHolder[uri]) {
            linkHolder[uri] = new LinkInfo(origin);
            ++found;
        }
    }
}
// Formats the time elapsed since `executionTime` as "HH:MM:SS.mmm".
// The elapsed milliseconds are rendered as an ISO date string and only the
// part between the "T" and the trailing "Z" is returned.
function getExecutionTime() {
    var elapsedMs = Date.now() - executionTime;
    var isoText = new Date(elapsedMs).toISOString();
    var timePart = isoText.match(/([^T]*)Z$/);
    return timePart[1];
}
// Renders a URL as an HTML anchor that opens in a new tab.
// The URL comes from crawled pages, so it is HTML-escaped before being put
// into the attribute and the link text.
//   href - URL to render (converted to a string)
// Returns the anchor markup.
function toLink(href) {
    var safe = String(href)
        .replace(/&/g, '&amp;') // must run first so later entities are not re-escaped
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    return '<a href="' + safe + '" target="_blank">' + safe + '</a>';
}
// Logs a short summary to the console and returns the links grouped by
// category.
// Each entry is an HTML line: "<status> <link><span> <origin link></span>".
// Returns { broken, visited, unvisited, redirected }, each an array of
// such lines.
function status() {
    var visited = [];
    var broken = [];
    var unvisited = [];
    var redirected = [];
    // Status code -> bucket. A local lookup replaces the former undeclared
    // `type` variable, which leaked as a global and reused the previous
    // bucket for an unrecognised status.
    var buckets = { '_': unvisited, 'V': visited, 'X': broken, 'R': redirected };
    for (var link in linkHolder) {
        var info = linkHolder[link];
        if (!Object.prototype.hasOwnProperty.call(buckets, info.status)) {
            continue; // unknown status: not part of any category
        }
        buckets[info.status].push(info.status + ' ' + toLink(link) + '<span> ' + toLink(info.origin) + '</span>');
    }
    console.log([
        'Execution time: ' + getExecutionTime(),
        found + ' found',
        unvisited.length + ' unvisited',
        visited.length + ' visited',
        redirected.length + ' redirected',
        broken.length + ' broken'
    ].join('\n'));
    return {
        broken: broken,
        visited: visited,
        unvisited: unvisited,
        redirected: redirected
    };
}
// Shows the crawl report (summary plus the links of each category, sorted)
// in a popup window, ready for printing. When the popup cannot be opened an
// alert is shown instead.
function showLinks() {
    var data = status();
    var br = '<br/>';
    var logInfo = [
        'Execution time: ' + getExecutionTime(),
        found + ' found',
        data.unvisited.length + ' unvisited',
        data.visited.length + ' visited',
        data.redirected.length + ' redirected',
        data.broken.length + ' broken',
        br,
        '### BROKEN: ' + data.broken.length,
        data.broken.sort().join(br),
        br,
        '### REDIRECTED: ' + data.redirected.length,
        data.redirected.sort().join(br),
        br,
        '### VISITED: ' + data.visited.length,
        data.visited.sort().join(br),
        br,
        '### UNVISITED: ' + data.unvisited.length,
        data.unvisited.sort().join(br)
    ];
    // An empty URL is the documented way to open a blank window
    // (null is not a documented value for this argument).
    var popup = open('', '_blank');
    if (!popup) {
        alert('Popup bloqueado.');
        return;
    }
    popup.document.write(
        '<head><style>a {color: #555;text-decoration: none;} span a {color: #bbb;}</style></head>' +
        '<body>' +
        '<div style="white-space:nowrap;font-size: 12px; font-family: Consolas,\'Lucida Console\',\'DejaVu Sans Mono\',monospace;">' +
        logInfo.join(br) +
        // the former stray "</pre>" had no opening tag and was dropped
        '</div></body>'
    );
    // finish the document stream opened by write()
    popup.document.close();
}
// Visits a link and extracts further links from the response (crawl step).
//   link - URL to fetch; a falsy value means there is nothing left to crawl
// On failure the link is marked 'X'. On success it is marked 'V', the
// success counter grows and, for string responses, the links found in the
// body are registered. In both cases run() is called to continue.
function visitLink(link) {
    if (!link) {
        console.log('FINISHED (no more links to crawl)');
        return;
    }

    function onFailure() {
        linkHolder[link].status = 'X';
        run();
    }

    function onSuccess(data) {
        success++;
        linkHolder[link].status = 'V';
        // only textual responses can be scanned for links
        if (typeof data === 'string') {
            pushLinks(getLinks(data, link), link);
        }
        run();
    }

    jQuery.ajax({ url: link }).fail(onFailure).done(onSuccess);
}
// Drives the crawl: visits the next pending link while the number of
// successful fetches is below `limit` and the crawl has not been stopped.
// Otherwise it stops, shows the report and raises `limit` to the next
// multiple of itself above `success`.
function run() {
    var mayContinue = success < limit && !stopped;
    if (mayContinue) {
        visitLink(getUnvisitedURI());
        return;
    }
    stopped = true;
    showLinks();
    // "| 0" truncates the quotient to an integer
    limit = ((success / limit) | 0) * limit + limit;
}
// Starts (or resumes) the crawl.
//   newLimit - optional positive number; when given it replaces the global
//              `limit` checked by run(). The parameter used to be named
//              `limit`, which shadowed the global and was never applied.
function start(newLimit) {
    if (typeof newLimit === 'number' && newLimit > 0) {
        limit = newLimit;
    }
    stopped = false;
    console.log('RUNNING...\nUse status() and showLinks()');
    executionTime = Date.now(); // reference point for getExecutionTime()
    run();
}
// Loads an external script by appending a <script> element to the page
// (used to bring in the jQuery copy the crawler needs).
//   src - URL of the script
function importScript(src) {
    var script = document.createElement('script');
    var parent = document.head || document.body;
    script.type = 'text/javascript';
    script.src = src;
    parent.appendChild(script);
}
// START
importScript('//ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js');
pushLinks([location.href]);
// Wait for jQuery to become available instead of assuming it has loaded
// after a fixed 2 seconds: on a slow connection the fixed delay would call
// jQuery.ajax before the library exists. Polls every 100 ms and gives up
// after about 10 seconds.
(function waitForJQuery(attemptsLeft) {
    if (typeof jQuery !== 'undefined' && typeof jQuery.ajax === 'function') {
        start();
    } else if (attemptsLeft > 0) {
        setTimeout(function () {
            waitForJQuery(attemptsLeft - 1);
        }, 100);
    } else {
        console.log('jQuery could not be loaded; crawler not started.');
    }
})(100);
//'limit:'+limit; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment