Skip to content

Instantly share code, notes, and snippets.

@pyropeter
Created July 27, 2010 14:16
Show Gist options
  • Save pyropeter/492270 to your computer and use it in GitHub Desktop.
Save pyropeter/492270 to your computer and use it in GitHub Desktop.
Find duplicate files
#!/usr/bin/env node
var sys = require('sys'),
fs = require('fs'),
childProcess = require('child_process')
// Shared mutable state of the scan. Each name is declared explicitly: the
// preceding `var` statement ends after `childProcess`, so bare assignments
// here would create implicit globals (a ReferenceError in strict mode).
var hashtable = {}        // file size -> paths seen with that size
var md5table = {}         // md5 digest -> paths that produced that digest
var fileblacklist = []    // paths given with -x that are still to be skipped
var filequeue = []        // paths waiting to be stat()ed
var md5queue = []         // files waiting to be hashed
var threadNum = 2         // this tends to be equivalent to the load average.
var threadsWaiting = 0    // workers currently idle because both queues are empty
var stated = 0            // paths stat()ed successfully
var hashed = 0            // md5sum runs that have completed
var samesize = 0          // files sharing their size with at least one other file
var duplicates = 0        // files whose digest had already been seen
var debug = false         // when true, progress messages go to the output file
/**
 * Right-align a value in a column by padding it with spaces on the left.
 *
 * @param {*} text - Value to display; converted with String().
 * @param {number} width - Minimum length of the result.
 * @returns {string} The string form of `text`, left-padded with spaces up to
 *     `width` characters; returned unpadded when it is already that long.
 */
function fill(text, width) {
    var str = String(text)
    // Number of spaces still missing to reach the requested width.
    var missing = Math.ceil(width - str.length)
    if (missing > 0) {
        return new Array(missing + 1).join(" ") + str
    }
    return str
}
// Redraw a one-line progress report on stdout every 300 ms. The leading "\r"
// returns the cursor to the start of the line so each report overwrites the
// previous one. Output goes through process.stdout.write because the
// sys.print/sys.puts helpers no longer exist in current Node.js releases.
var statistics = setInterval(function () {
    process.stdout.write(
        "\rHashed: " + fill(hashed, 6) +
        "/" + fill(stated, 6) +
        " Queues: " + fill(md5queue.length, 1) +
        "/" + fill(filequeue.length, 3) +
        " Threads: " + fill(threadNum - threadsWaiting, 1) +
        "/" + fill(threadNum, 1) +
        " Dup: " + fill(duplicates, 3) +
        "/" + fill(samesize, 6))
    // threadNum reaches 0 once every worker has retired: print the final
    // newline and stop the timer.
    if (!threadNum) {
        clearInterval(statistics)
        process.stdout.write("\n")
    }
}, 300)
/**
 * Pull the digest out of the output of one `md5sum` run.
 *
 * @param {string} stdout - Raw standard output of `md5sum <file>`.
 * @returns {?string} The 32-digit lowercase hex digest, or null when the
 *     output does not start with one.
 */
function parseMd5Digest (stdout) {
    // GNU md5sum starts the line with a backslash when it had to escape the
    // file name, so an optional leading "\" is skipped.
    var match = /^\\?([0-9a-f]{32})\s/.exec(String(stdout))
    return match ? match[1] : null
}

/**
 * Hash one file with the external `md5sum` program and record the digest in
 * md5table. When the digest was seen before, the file counts as a duplicate
 * and the whole group is written to dupstream. The next handleEntry step is
 * scheduled once md5sum has finished, whether or not it succeeded.
 *
 * @param {string} file - Path of the file to hash.
 */
function hashFile (file) {
    debug && dupstream.write("Hashing file " + file + "\n")
    // execFile hands the path over as one argv entry without going through a
    // shell, so quotes, "$" or backticks in file names cannot break or inject
    // into the command; "--" keeps a leading "-" from being read as an option.
    childProcess.execFile('md5sum', ['--', file], function (error, stdout) {
        hashed++
        var md5sum = error ? null : parseMd5Digest(stdout)
        if (md5sum === null) {
            // A failed run has no digest. Recording it anyway would make all
            // unreadable files look like duplicates of each other.
            debug && dupstream.write("Could not hash file " + file + "\n")
        } else if (md5table[md5sum]) {
            duplicates++
            md5table[md5sum].push(file)
            dupstream.write("Duplicates: " +
                md5table[md5sum].join(", ") + "\n")
        } else {
            md5table[md5sum] = [file]
        }
        process.nextTick(handleEntry)
    })
}

/**
 * One step of a worker loop; every step ends by scheduling the next one.
 *
 * In order of priority a step
 *   1. hashes a file from md5queue, or
 *   2. takes a path from filequeue, drops it if it is blacklisted, and
 *      otherwise stats it: directories have their entries queued, regular
 *      files are grouped by size and queued for hashing as soon as a second
 *      file of the same size shows up, or
 *   3. idles for 100 ms when both queues are empty.
 * When every remaining worker is idle at once, the worker retires by
 * decrementing threadNum and not rescheduling itself.
 */
function handleEntry () {
    // test if files should be md5-hashed
    var file = md5queue.pop()
    if (typeof file === 'string') {
        hashFile(file)
        return
    }

    // check if waiting is necessary
    file = filequeue.pop()
    if (typeof file !== 'string') {
        debug && dupstream.write("Waiting.\n")
        threadsWaiting++
        if (threadsWaiting == threadNum) {
            // Nobody is left to produce new work: retire this worker.
            threadsWaiting--
            threadNum--
            return
        }
        // Another worker is still busy and may queue more paths; poll again.
        setTimeout(function () {
            threadsWaiting--
            process.nextTick(handleEntry)
        }, 100)
        return
    }

    // check blacklist; a matched entry is removed from the list
    var blacklistIndex = fileblacklist.indexOf(file)
    if (blacklistIndex !== -1) {
        fileblacklist.splice(blacklistIndex, 1)
        process.nextTick(handleEntry)
        return
    }

    // stat path
    debug && dupstream.write("Checking file " + file + "\n")
    fs.stat(file, function (err, stats) {
        if (err) {
            // Paths that cannot be stat()ed are skipped.
            process.nextTick(handleEntry)
            return
        }
        stated++
        if (stats.isDirectory()) {
            fs.readdir(file, function (readErr, names) {
                if (!readErr) {
                    names.forEach(function (name) {
                        filequeue.push(file + "/" + name)
                    })
                }
                process.nextTick(handleEntry)
            })
            return
        }
        if (stats.isFile()) {
            var sameSizeFiles = hashtable[stats.size]
            if (sameSizeFiles) {
                samesize++
                if (sameSizeFiles.length == 1) {
                    // The first file of this size was not hashed when it was
                    // found; count it and queue it now that it has a peer.
                    samesize++
                    md5queue.push(sameSizeFiles[0])
                }
                md5queue.push(file)
                sameSizeFiles.push(file)
            } else {
                hashtable[stats.size] = [file]
            }
        }
        process.nextTick(handleEntry)
    })
}
/**
 * Print the command-line help to stdout and terminate with exit status 1.
 * Uses console.log because the sys.puts helper no longer exists in current
 * Node.js releases.
 */
function printUsageAndExit () {
    console.log("")
    console.log("Usage: node this.js" +
        " <outfile> <path> [...] [-x <path> [...]]")
    console.log("")
    console.log("Pathes of duplicate files are written to <outfile>")
    console.log("All pathes are searched recursivly")
    console.log("-x <path> excludes path from beeing searched")
    console.log("")
    process.exit(1)
}

if (process.argv.length < 4) {
    printUsageAndExit()
}

// Sort the remaining arguments into search roots and excluded paths. A single
// trailing slash is stripped so the paths compare equal to the
// "<dir>/<name>" strings built while walking directories.
for (var argIndex = 3 ; argIndex < process.argv.length ; argIndex++) {
    if (process.argv[argIndex] == "-x") {
        argIndex++
        if (argIndex >= process.argv.length) {
            // "-x" was the last argument: there is no path to exclude.
            printUsageAndExit()
        }
        fileblacklist.push(process.argv[argIndex].replace(/\/$/, ""))
    } else {
        filequeue.push(process.argv[argIndex].replace(/\/$/, ""))
    }
}

// The output file is only created once the arguments are known to be valid.
var dupstream = fs.createWriteStream(process.argv[2])

// Start the workers; each one keeps rescheduling itself until it retires.
for (var started = 0 ; started < threadNum ; started++)
    handleEntry()
@pyropeter
Copy link
Author

das ist aber javascript :-P
wahrscheinlich müsste das da heißen:
#!/usr/bin/env node

@sebix
Copy link

sebix commented Jul 28, 2010

Der Code erinnerte mich so an Python.
Wusste gar nicht, dass JS keine Strichpunkte hat :D

@pyropeter
Copy link
Author

Jo, war mir auch ganz neu.
Da habe ich jetzt Jahre gebraucht, um mir das Semikolon anzugewöhnen, und jetzt brauche ich es nicht mehr :-P

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment