Last active
August 11, 2019 11:47
-
-
Save m1k1o/9fc2603f9407c466ac47edb1bb7dae62 to your computer and use it in GitHub Desktop.
This simple Node.js script crawls Apache's well-known "Index of" pages and builds a file tree with file sizes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const spawn = require("child_process").spawn; | |
/**
 * Converts the text of a listing's size cell into a number of bytes.
 *
 * Accepted forms are a plain decimal number ("512") or a decimal number
 * followed by one of the suffixes K, M, G or T (any letter case), each step
 * being a factor of 1024. Surrounding whitespace is ignored.
 *
 * @param {string} size - Raw size text, e.g. "1.2K", "4.0M", "512" or "-".
 * @returns {number|null|*} The rounded byte count; `null` when the cell is
 *   "-"; otherwise the input is returned unchanged when it is not a string
 *   or does not match a recognised form.
 */
function parseSize(size) {
  if (size === '-') return null;
  if (typeof size !== 'string') return size;

  // Require a well-formed decimal so inputs such as "." or "1.2.3" are
  // reported as unrecognised instead of silently becoming NaN.
  const match = /^(\d+(?:\.\d*)?|\.\d+)([KMGT]?)$/i.exec(size.trim());
  if (match === null) return size;

  // Position in this string is the power of 1024 for the suffix
  // ("" -> 0, K -> 1, M -> 2, G -> 3, T -> 4).
  const exponent = match[2] === '' ? 0 : 'KMGT'.indexOf(match[2].toUpperCase()) + 1;
  return Math.round(parseFloat(match[1]) * 1024 ** exponent);
}
var qs = require('querystring'); | |
/**
 * Extracts the folder and file entries from the HTML of an "Index of" page
 * rendered as a table (one `<tr>...</tr>` per entry on a single line).
 *
 * Each returned entry has the string fields `icon`, `url`, `name` and `date`
 * taken from the row, plus `size` as returned by `parseSize`. `&amp;` in the
 * link target is decoded to `&`. Rows that do not match the expected cell
 * layout, and the "back" (parent directory) row, are skipped.
 *
 * @param {string} content - HTML source of the listing page.
 * @returns {{folders: object[], files: object[]}} Entries whose icon is
 *   "folder" go to `folders`; every other entry goes to `files`.
 */
function parse(content) {
  const matches = { folders: [], files: [] };

  // String.prototype.match returns null when nothing matches, so fall back
  // to an empty list instead of spreading null.
  const rows = (typeof content === 'string' && content.match(/<tr>(.*?)<\/tr>/g)) || [];
  const rowPattern = /<td.*?><img src="\/icons\/(?<icon>.*?)\.gif".*?><\/td><td.*?><a href="\s*(?<url>.*?)\s*">\s*(?<name>.*?)\s*<\/a><\/td><td.*?>\s*(?<date>.*?)\s*<\/td><td.*?>\s*(?<size>.*?)\s*<\/td><td.*?>.*?<\/td>/;

  for (const row of rows) {
    const match = row.match(rowPattern);
    if (match === null || match.groups.icon === 'back') continue;

    const entry = {
      ...match.groups,
      size: parseSize(match.groups.size),
      // Attribute values are HTML-escaped; restore literal ampersands so the
      // URL can be requested as-is.
      url: match.groups.url.replace(/&amp;/g, '&'),
    };

    if (entry.icon === 'folder') {
      matches.folders.push(entry);
    } else {
      matches.files.push(entry);
    }
  }

  return matches;
}
/**
 * Adds up the `size` of every entry in a list.
 *
 * Only finite numeric sizes are counted. Entries whose size is `null`, a
 * string or otherwise not a finite number contribute nothing, so the result
 * is always a number rather than an accidental string concatenation.
 *
 * @param {Array<{size: *}>} files - Entries carrying a `size` property.
 * @returns {number} Sum of the finite numeric sizes; 0 for an empty list.
 */
function sumSizes(files) {
  return files.reduce(
    (total, { size }) => (Number.isFinite(size) ? total + size : total),
    0
  );
}
/**
 * Checks whether a page looks like a directory listing: it must contain both
 * a `<title>Index of ...</title>` and an `<h1>Index of ...</h1>` element,
 * each on a single line.
 *
 * @param {string} content - HTML source of the fetched page.
 * @returns {boolean} `true` when both markers are present, `false` otherwise.
 */
function isPageValid(content) {
  const hasIndexTitle = /<title>Index of .*<\/title>/.test(content);
  const hasIndexHeading = /<h1>Index of .*<\/h1>/.test(content);
  return hasIndexTitle && hasIndexHeading;
}
/**
 * Fetches a URL by running the `curl` command and collecting its standard
 * output as UTF-8 text.
 *
 * The promise resolves once the child process has closed, with whatever was
 * written to stdout (possibly an empty string); the exit code is not
 * inspected. It rejects if the process cannot be spawned, for example when
 * `curl` is not installed.
 *
 * @param {string} url - Address passed to curl as its only argument.
 * @returns {Promise<string>} The text curl wrote to stdout.
 */
function getPage(url) {
  return new Promise((resolve, reject) => {
    const child = spawn('curl', [url]);
    let output = '';

    // With an encoding set, 'data' chunks arrive already decoded as strings.
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      output += chunk;
    });

    // Without a listener, a spawn failure would surface as an uncaught
    // 'error' event instead of a rejected promise.
    child.on('error', reject);
    child.on('close', () => resolve(output));
  });
}
var fs = require('fs'); | |
if(process.argv.length == 2) { | |
console.log("\nUsage: "); | |
console.log("node index.js url [outputFile] [queueFile]"); | |
console.log("\nExample: "); | |
console.log("node index.js http://localhost"); | |
console.log("node index.js http://localhost files.txt queue.json"); | |
console.log("node index.js http://localhost/media files.txt queue.json"); | |
console.log("\n[outputFile] is the script result."); | |
console.log("[queueFile] saves current state."); | |
return ; | |
} | |
/**
 * Entry point: crawls the listing at `url` and everything below it.
 *
 * Paths are taken from the end of a work queue. For each valid listing page
 * the files are appended to `outputFile` as "path<TAB>size" lines and the
 * sub-folders are added to the queue. When `queueFile` is given, the queue is
 * loaded from it at start (if it holds a JSON array) and rewritten after
 * every page so an interrupted run can be resumed.
 *
 * @param {object} options
 * @param {string} options.url - Base URL; queue paths are appended to it.
 * @param {string[]} [options.entrypoints] - Initial queue of paths.
 * @param {string} [options.outputFile] - File the result lines are appended to.
 * @param {string} [options.queueFile] - Optional file holding the saved queue.
 */
(async function main({ url, entrypoints = [""], outputFile = 'files.txt', queueFile = '' }) {
  console.log("url:\t" + url + "\toutputFile:\t" + outputFile + "\tqueueFile:\t" + queueFile + "\n");

  // Runs a callback-style fs method and exposes its outcome as a promise,
  // rejecting with the original error object.
  const callFs = (method, ...args) =>
    new Promise((resolve, reject) => {
      fs[method](...args, (err, result) => (err ? reject(err) : resolve(result)));
    });

  const stats = {
    bytes: 0,
    files: 0,
    folders: 0,
  };

  // Read queue
  let queue = entrypoints;
  if (queueFile) {
    let saved = null;
    try {
      saved = JSON.parse(await callFs('readFile', queueFile, 'utf8'));
    } catch (err) {
      // A missing queue file is normal on the first run. Anything else
      // (unreadable file, invalid JSON) is reported, then the crawl starts
      // from the entrypoints.
      if (err.code !== 'ENOENT') {
        console.error("Ignoring queue file " + queueFile + ": " + err.message);
      }
    }
    // Only a JSON array is a usable queue.
    if (Array.isArray(saved)) queue = saved;
  }

  while (queue.length > 0) {
    const path = queue.pop();
    const html = await getPage(url + path);

    if (!isPageValid(html)) {
      console.log("Invalid page\t" + path);
      continue;
    }

    const data = parse(html);
    const bytes = sumSizes(data.files);

    stats.bytes += bytes;
    stats.files += data.files.length;
    stats.folders += data.folders.length;

    // Append files
    const files = data.files.map((file) => path + file.url + "\t" + file.size).join("\n");
    if (files) {
      await callFs('appendFile', outputFile, files + "\n");
    }

    queue = queue.concat(data.folders.map((folder) => path + folder.url));
    console.log("Queue:\t" + queue.length + "\tDirs:\t" + data.folders.length + "\tFiles:\t" + data.files.length + "\tBytes:\t" + bytes + "\t" + path);

    // Save queue
    if (queueFile) {
      await callFs('writeFile', queueFile, JSON.stringify(queue));
    }
  }

  console.log("---TOTAL:---");
  console.log("Dirs:\t" + stats.folders + "\tFiles:\t" + stats.files + "\tBytes:\t" + stats.bytes + "\t" + url + "\n");
})({
  url: process.argv[2],
  outputFile: process.argv[3],
  queueFile: process.argv[4],
}).catch((err) => {
  // Report the failure and signal it through the exit status instead of
  // leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment