Created
July 27, 2010 14:16
-
-
Save pyropeter/492270 to your computer and use it in GitHub Desktop.
Find duplicate files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
// Node core modules. `sys` is bound to `util`, the module that the
// deprecated `sys` name is an alias for; the binding is kept because other
// code in this file refers to `sys`.
var sys = require('util'),
    fs = require('fs'),
    childProcess = require('child_process')

// Every name below is declared explicitly. Without `var` they would be
// implicit globals, which is a ReferenceError in strict mode.

// Regular files seen so far, grouped by size: size in bytes -> [path, ...]
var hashtable = {}
// Hashed files grouped by checksum: md5 digest -> [path, ...]
var md5table = {}
// Paths given with -x on the command line; they are skipped when dequeued
var fileblacklist = []
// Paths that still have to be stat'ed
var filequeue = []
// Paths that still have to be hashed
var md5queue = []

// Number of handleEntry() chains started at launch.
// This tends to be equivalent to the load average.
var threadNum = 2
// Number of chains that currently find both queues empty
var threadsWaiting = 0

// Counters shown in the progress line
var stated = 0      // paths successfully stat'ed
var hashed = 0      // md5sum runs that have completed
var samesize = 0    // files whose size matches at least one other file
var duplicates = 0  // files whose digest was already known

// When true, diagnostic messages are written to the output file as well
var debug = false
/**
 * Right-align a value by left-padding its string form with spaces.
 *
 * @param {*} text - value to display; it is converted with String().
 * @param {number} width - minimum length of the returned string.
 * @returns {string} the converted value, preceded by as many spaces as are
 *     needed to reach `width`; unchanged if it is already that long.
 */
function fill(text, width) {
    var str = String(text)
    var padding = ""
    // One space per missing character; the parameter itself is not modified
    for (var len = str.length; len < width; len++) padding += " "
    return padding + str
}
/**
 * Build the progress line from the current counters and queue lengths.
 *
 * @returns {string} the line, starting with "\r" so that writing it to a
 *     terminal overwrites the previously written line.
 */
function statusLine() {
    return "\rHashed: " + fill(hashed, 6) +
        "/" + fill(stated, 6) +
        " Queues: " + fill(md5queue.length, 1) +
        "/" + fill(filequeue.length, 3) +
        " Threads: " + fill(threadNum - threadsWaiting, 1) +
        "/" + fill(threadNum, 1) +
        " Dup: " + fill(duplicates, 3) +
        "/" + fill(samesize, 6)
}

// Redraw the progress line every 300 ms. Output goes straight to
// process.stdout because sys.print/sys.puts no longer exist in current Node.
var statistics = setInterval(function () {
    process.stdout.write(statusLine())
    if (threadNum === 0) {
        // All handleEntry() chains have ended: stop redrawing and finish the line
        clearInterval(statistics)
        process.stdout.write("\n")
    }
}, 300)
/**
 * Extract the digest from the output of `md5sum <file>`.
 *
 * @param {string} stdout - what md5sum printed.
 * @returns {?string} the 32-digit hexadecimal digest, or null if the output
 *     does not start with one.
 */
function parseMd5sumOutput(stdout) {
    // GNU md5sum puts a backslash in front of the digest when it had to
    // escape characters of the file name; accept and skip it.
    var match = /^\\?([a-f0-9]{32})(?:\s|$)/.exec(String(stdout))
    return match ? match[1] : null
}

/**
 * Process one unit of work and then schedule itself again.
 *
 * Each call does exactly one of the following, in this order of priority:
 *   1. hash the next path of md5queue and record it in md5table, writing a
 *      "Duplicates: ..." line to dupstream if the digest was already known;
 *   2. if both queues are empty, wait 100 ms and retry, or end this chain
 *      when every chain is waiting;
 *   3. drop the next path of filequeue if it is blacklisted;
 *   4. stat the next path of filequeue: directories have their entries
 *      queued, regular files are grouped by size and queued for hashing as
 *      soon as a second file of the same size shows up.
 *
 * Paths that cannot be stat'ed, listed or hashed are skipped.
 */
function handleEntry () {
    // Hashing has priority over discovering more files
    var file = md5queue.pop()
    if (typeof file === 'string') {
        debug && dupstream.write("Hashing file " + file + "\n")
        // execFile hands the path to md5sum as a single argument without
        // going through a shell, so quotes, "$" or backticks in a file name
        // are harmless; "--" keeps a leading "-" from being read as an option.
        childProcess.execFile('md5sum', ['--', file], function
                (error, stdout, stderr) {
            hashed++
            var md5sum = error ? null : parseMd5sumOutput(stdout)
            if (md5sum === null) {
                // Without a digest the file must not be recorded: all
                // failed files would end up under one key and be reported
                // as duplicates of each other.
                debug && dupstream.write("Hashing failed for " + file + "\n")
            } else if (md5table[md5sum]) {
                duplicates++
                md5table[md5sum].push(file)
                dupstream.write("Duplicates: " +
                    md5table[md5sum].join(", ") + "\n")
            } else {
                md5table[md5sum] = [file]
            }
            process.nextTick(handleEntry)
        })
        return
    }

    // Check whether waiting is necessary
    file = filequeue.pop()
    if (typeof file !== 'string') {
        debug && dupstream.write("Waiting.\n")
        threadsWaiting++
        if (threadsWaiting === threadNum) {
            // Every chain is idle, so no new work can appear: end this one.
            // The others end the same way when their timers fire.
            threadsWaiting--
            threadNum--
            return
        }
        setTimeout(function () {
            threadsWaiting--
            process.nextTick(handleEntry)
        }, 100)
        return
    }

    // Check the blacklist; a matching entry is used up by the match
    var blacklistIndex = fileblacklist.indexOf(file)
    if (blacklistIndex !== -1) {
        fileblacklist.splice(blacklistIndex, 1)
        process.nextTick(handleEntry)
        return
    }

    // Stat the path
    debug && dupstream.write("Checking file " + file + "\n")
    fs.stat(file, function (err, stats) {
        if (err) {
            process.nextTick(handleEntry)
            return
        }
        stated++
        if (stats.isDirectory()) {
            fs.readdir(file, function (err, files) {
                if (!err) {
                    for (var i = 0; i < files.length; i++)
                        filequeue.push(file + "/" + files[i])
                }
                process.nextTick(handleEntry)
            })
            return
        }
        if (stats.isFile()) {
            var sameSizeFiles = hashtable[stats.size]
            if (sameSizeFiles) {
                samesize++
                if (sameSizeFiles.length === 1) {
                    // The first file of this size was not hashed when it
                    // was found; count it and queue it now.
                    samesize++
                    md5queue.push(sameSizeFiles[0])
                }
                md5queue.push(file)
                sameSizeFiles.push(file)
            } else {
                hashtable[stats.size] = [file]
            }
        }
        process.nextTick(handleEntry)
    })
}
/**
 * Remove one trailing "/" from a path given on the command line.
 *
 * @param {string} path - path as typed by the user.
 * @returns {string} the path without its trailing slash; a path that
 *     consists of a single character (such as "/") is returned unchanged.
 */
function stripTrailingSlash(path) {
    // Stripping a lone "/" would leave an empty string instead of the root
    return path.length > 1 ? path.replace(/\/$/, "") : path
}

// Command line: <outfile> <path> [...] [-x <path> [...]]
if (process.argv.length < 4) {
    process.stdout.write([
        "",
        "Usage: node this.js" +
            " <outfile> <path> [...] [-x <path> [...]]",
        "",
        "Pathes of duplicate files are written to <outfile>",
        "All pathes are searched recursivly",
        "-x <path> excludes path from beeing searched",
        ""
    ].join("\n") + "\n")
    process.exit(1)
}

// Sort the remaining arguments into search roots and exclusions. This is
// done before the output file is opened so that a malformed command line
// does not create or truncate it.
for (var i = 3 ; i < process.argv.length ; i++) {
    if (process.argv[i] == "-x") {
        i++
        if (i >= process.argv.length) {
            // "-x" was the last argument; there is no path to exclude
            process.stderr.write("-x requires a path\n")
            process.exit(1)
        }
        fileblacklist.push(stripTrailingSlash(process.argv[i]))
    } else
        filequeue.push(stripTrailingSlash(process.argv[i]))
}

// Stream that receives the "Duplicates: ..." lines (and debug messages)
var dupstream = fs.createWriteStream(process.argv[2])

// Start the concurrent handleEntry() chains
for (var started = 0 ; started < threadNum ; started++)
    handleEntry()
Jo, war mir auch ganz neu.
Da habe ich jetzt Jahre gebraucht, um mir das Semikolon anzugewöhnen, und jetzt brauche ich es nicht mehr :-P
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Der Code erinnerte mich so an Python.
Wusste gar nicht, dass JS keine Strichpunkte hat :D