Skip to content

Instantly share code, notes, and snippets.

@m1k1o
Last active August 11, 2019 11:47
Show Gist options
  • Save m1k1o/9fc2603f9407c466ac47edb1bb7dae62 to your computer and use it in GitHub Desktop.
Save m1k1o/9fc2603f9407c466ac47edb1bb7dae62 to your computer and use it in GitHub Desktop.
This simple Node.js script crawls the well-known Apache "Index of" pages and builds a file tree with file sizes.
const spawn = require("child_process").spawn;
/**
 * Converts a size column value from a directory listing into a byte count.
 *
 * @param {string} size - Size text such as "-", "512", "1.5K", "20M", "3G" or "1T".
 * @returns {number|string|null} `null` for "-", the rounded number of bytes
 *   when the text is digits/dots with an optional K/M/G/T suffix (1024-based),
 *   otherwise the input value unchanged.
 */
function parseSize(size) {
  // Loose comparison kept on purpose so coercion behaves exactly as before.
  if (size == '-') {
    return null;
  }

  // Everything that is not a digit or a dot is dropped before parsing.
  const numericPart = parseFloat(size.replace(/[^0-9\.]/g, ''));

  // One pattern covers the plain number and every supported unit suffix.
  const shape = /^[0-9\.]+([KMGT]?)$/.exec(size);
  if (shape === null) {
    return size;
  }

  // Power of 1024 to apply for each suffix ('' means plain bytes).
  const exponents = { '': 0, K: 1, M: 2, G: 3, T: 4 };
  return Math.round(numericPart * 1024 ** exponents[shape[1]]);
}
var qs = require('querystring');
/**
 * Extracts folder and file entries from the HTML of an "Index of" listing.
 *
 * Every `<tr>…</tr>` row (on a single line) is matched against the expected
 * cell layout: icon, link, date, size, description. Rows that do not match,
 * and the row whose icon is "back", are skipped.
 *
 * @param {string} content - HTML source of the listing page.
 * @returns {{folders: Object[], files: Object[]}} Entries with the fields
 *   `icon`, `url`, `name`, `date` and `size`; `size` is the result of
 *   `parseSize` and `url` has `&amp;` decoded to `&`. Rows whose icon is
 *   "folder" go to `folders`, everything else to `files`.
 */
function parse(content) {
  // String.prototype.match returns null (not an empty array) when nothing
  // matches; fall back to [] so a page without table rows yields no entries
  // instead of throwing.
  const rows = content.match(/<tr>(.*?)<\/tr>/g) || [];

  // The dot before "gif" is escaped so it only matches a literal ".gif".
  const rowPattern = /<td.*?><img src="\/icons\/(?<icon>.*?)\.gif".*?><\/td><td.*?><a href="\s*(?<url>.*?)\s*">\s*(?<name>.*?)\s*<\/a><\/td><td.*?>\s*(?<date>.*?)\s*<\/td><td.*?>\s*(?<size>.*?)\s*<\/td><td.*?>.*?<\/td>/;

  const matches = { folders: [], files: [] };
  for (const row of rows) {
    const match = row.match(rowPattern);
    if (match === null || match.groups.icon === 'back') {
      continue;
    }

    // Build a fresh entry rather than mutating the regex result object.
    const entry = {
      ...match.groups,
      url: match.groups.url.replace(/&amp;/g, "&"),
      size: parseSize(match.groups.size),
    };

    if (entry.icon === "folder") {
      matches.folders.push(entry);
    } else {
      matches.files.push(entry);
    }
  }
  return matches;
}
/**
 * Adds up the `size` of every entry.
 *
 * Only finite numbers are counted. A `null` size or a size that is still a
 * raw string is skipped, so one odd entry cannot turn the total into a
 * concatenated string or NaN.
 *
 * @param {{size: *}[]} files - Entries carrying a `size` property.
 * @returns {number} Sum of all finite numeric sizes; 0 for an empty list.
 */
function sumSizes(files) {
  return files.reduce(
    (total, { size }) => (Number.isFinite(size) ? total + size : total),
    0
  );
}
/**
 * Tells whether the HTML looks like an "Index of" listing page.
 *
 * @param {string} content - HTML source of the fetched page.
 * @returns {boolean} `true` only when both a `<title>Index of …</title>` and
 *   an `<h1>Index of …</h1>` element are present, each on a single line.
 */
function isPageValid(content) {
  const titlePattern = /<title>Index of .*<\/title>/;
  const headingPattern = /<h1>Index of .*<\/h1>/;

  if (!titlePattern.test(content)) {
    return false;
  }
  return headingPattern.test(content);
}
/**
 * Downloads a page by running the `curl` command-line tool and collecting
 * everything it writes to stdout.
 *
 * @param {string} url - Address passed to curl as its only argument.
 * @returns {Promise<string>} Resolves with curl's stdout once the process has
 *   closed. The exit code is not inspected, so a failed transfer resolves
 *   with whatever was received (possibly an empty string). Rejects only when
 *   the process itself cannot be started (for example curl is not installed).
 */
function getPage(url) {
  return new Promise((resolve, reject) => {
    // Only stdout is consumed. stdin and stderr are ignored so curl's
    // progress output never accumulates in a pipe that nobody reads.
    const child = spawn('curl', [url], { stdio: ['ignore', 'pipe', 'ignore'] });

    let output = "";
    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      output += chunk;
    });

    // Without an 'error' listener a failed spawn is thrown as an uncaught
    // exception; route it to the promise so callers can handle it.
    child.on('error', reject);
    child.on('close', () => {
      resolve(output);
    });
  });
}
var fs = require('fs');
if(process.argv.length == 2) {
console.log("\nUsage: ");
console.log("node index.js url [outputFile] [queueFile]");
console.log("\nExample: ");
console.log("node index.js http://localhost");
console.log("node index.js http://localhost files.txt queue.json");
console.log("node index.js http://localhost/media files.txt queue.json");
console.log("\n[outputFile] is the script result.");
console.log("[queueFile] saves current state.");
return ;
}
/**
 * Crawler entry point: walks the directory listing starting at `url`,
 * appends one "path<TAB>size" line per file to `outputFile`, and prints
 * per-page and total statistics.
 *
 * @param {Object} options
 * @param {string} options.url - Base address. Each queued path is appended to
 *   it verbatim (`url + path`), so the two must concatenate to a valid URL.
 * @param {string[]} [options.entrypoints=[""]] - Paths to start from when no
 *   saved queue is available.
 * @param {string} [options.outputFile='files.txt'] - File the listing is appended to.
 * @param {string} [options.queueFile=''] - Optional JSON file holding the
 *   pending paths. It is read at start-up and rewritten after every page that
 *   was processed successfully.
 */
(async function main({ url, entrypoints = [""], outputFile = 'files.txt', queueFile = '' }) {
  console.log(`url:\t${url}\toutputFile:\t${outputFile}\tqueueFile:\t${queueFile}\n`);

  const stats = {
    bytes: 0,
    files: 0,
    folders: 0,
  };

  // Work on a copy so pop() never mutates the caller's entrypoints array.
  let queue = [...entrypoints];

  // Read queue
  if (queueFile) {
    let raw = null;
    try {
      raw = await fs.promises.readFile(queueFile, 'utf8');
    } catch (err) {
      // A missing queue file just means a fresh start. Any other read problem
      // is reported, and the crawl still starts from the entry points.
      if (err.code !== 'ENOENT') {
        console.error(`Cannot read queue file ${queueFile}: ${err.message}`);
      }
    }
    if (raw !== null) {
      const saved = JSON.parse(raw);
      // Anything other than an array cannot be used as a queue.
      if (Array.isArray(saved)) {
        queue = saved;
      }
    }
  }

  while (queue.length > 0) {
    const path = queue.pop();
    const html = await getPage(url + path);
    if (!isPageValid(html)) {
      // The queue file is intentionally left untouched here, so the path is
      // still in the last saved queue.
      console.log("Invalid page\t" + path);
      continue;
    }

    const { files, folders } = parse(html);
    const bytes = sumSizes(files);
    stats.bytes += bytes;
    stats.files += files.length;
    stats.folders += folders.length;

    // Append files
    const listing = files.map((file) => `${path}${file.url}\t${file.size}`).join("\n");
    if (listing) {
      await fs.promises.appendFile(outputFile, listing + "\n");
    }

    // Sub-folders are appended to the end of the queue and pop() takes from
    // the end, so the most recently found folder is visited next.
    queue = [...queue, ...folders.map((folder) => path + folder.url)];
    console.log(`Queue:\t${queue.length}\tDirs:\t${folders.length}\tFiles:\t${files.length}\tBytes:\t${bytes}\t${path}`);

    // Save queue
    if (queueFile) {
      await fs.promises.writeFile(queueFile, JSON.stringify(queue));
    }
  }

  console.log("---TOTAL:---");
  console.log(`Dirs:\t${stats.folders}\tFiles:\t${stats.files}\tBytes:\t${stats.bytes}\t${url}\n`);
})({
  url: process.argv[2],
  outputFile: process.argv[3],
  queueFile: process.argv[4],
}).catch((err) => {
  // Report the failure and signal it through the exit code instead of leaving
  // the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment