Skip to content

Instantly share code, notes, and snippets.

@yak1ex
Last active March 6, 2018 11:41
Show Gist options
  • Select an option

  • Save yak1ex/c90aed92a920f043e8a388871a0233c5 to your computer and use it in GitHub Desktop.

Select an option

Save yak1ex/c90aed92a920f043e8a388871a0233c5 to your computer and use it in GitHub Desktop.
dupchecker
node_modules/
*.BAK
'use strict'
// TODO: error handling
const Promise = require('bluebird')
const fs = Promise.promisifyAll(require('fs-extra'))
const path = require('path')
const Database = require('better-sqlite3')
const crypto = require('crypto')
const commandLineArgs = require('command-line-args')
const getUsage = require('command-line-usage')
// Sections handed to command-line-usage to render the help screen.
// The third section's `optionList` is also used as the option definitions
// for command-line-args, so its position (index 2) must not change.
const usageSpec = [
  { header: 'dupcheck.js', content: 'File duplication checker' },
  {
    header: 'Synopsis',
    // One row per supported invocation; every row shares the '$' prompt column.
    content: [
      '--db <dbfile> <target_dirs>...',
      '--db <dbfile> --target <target_dir>...',
      '--db <dbfile> --target <target_dir> --target <target_dir>...',
      '--db <dbfile>',
      '--db <dbfile> --list',
      '--help'
    ].map(args => ({ colA: '$', colB: `dupcheck.js ${args}` }))
  },
  {
    header: 'Options',
    optionList: [
      {
        name: 'db',
        alias: 'd',
        type: String,
        typeLabel: 'dbfile',
        description: 'A hash db filename'
      },
      {
        // Bare (unnamed) arguments are collected here as well.
        name: 'target',
        alias: 't',
        type: String,
        typeLabel: 'target_dir',
        multiple: true,
        defaultOption: true,
        description: 'Target folders'
      },
      {
        name: 'list',
        alias: 'l',
        type: Boolean,
        description: 'Show recorded target folders'
      },
      {
        name: 'help',
        alias: 'h',
        type: Boolean,
        description: 'Show help'
      }
    ]
  },
  {
    header: 'Notes',
    content: [
      'Specified target folders are recorded in db.',
      'You can omit target options if they are already recorded.'
    ]
  }
]
// Parse the command line using the definitions from the 'Options' usage section.
const options = commandLineArgs(usageSpec[2].optionList)
if (options.help) {
  // Help was asked for explicitly, so this is a successful run.
  console.log(getUsage(usageSpec))
  process.exit(0)
}
if (options.db === undefined) {
  // Every other mode needs a database file: show the usage and fail.
  console.log(getUsage(usageSpec))
  process.exit(1)
}
// Open the hash database named on the command line.
const db = new Database(options.db)

// Make sure the schema exists. IF NOT EXISTS makes this safe to run on every
// start and also completes a schema that was only partially created.
//   files   - one row per recorded file; hash stays NULL until it is needed
//   targets - the folders to scan
db.exec('CREATE TABLE IF NOT EXISTS files (filename TEXT PRIMARY KEY, size INTEGER, hash TEXT, mtime INTEGER)')
db.exec('CREATE INDEX IF NOT EXISTS files_idx on files (size, hash)')
db.exec('CREATE TABLE IF NOT EXISTS targets (target TEXT PRIMARY KEY)')

// Prepared statements shared by the rest of the script.
const psWalk = db.prepare('SELECT * FROM files')
const psDelete = db.prepare('DELETE FROM files WHERE filename = ?')
const psInsert = db.prepare('INSERT INTO files VALUES(?,?,?,?)')
const psInsertWoHash = db.prepare('INSERT INTO files (filename, size, mtime) VALUES(?,?,?)')
const psUpdate = db.prepare('UPDATE files SET size = ?, hash = ?, mtime = ? WHERE filename = ?')
const psUpdateHash = db.prepare('UPDATE files SET hash = ? WHERE filename = ?')
const psExists = db.prepare('SELECT mtime FROM files WHERE filename = ?')
const psCheckSize = db.prepare('SELECT filename, hash FROM files WHERE size = ?')
const psCheckHash = db.prepare('SELECT filename FROM files WHERE size = ? AND hash = ?')
const psGetHash = db.prepare('SELECT hash FROM files WHERE filename = ?')
const psGetTarget = db.prepare('SELECT * FROM targets')
const psInsertTarget = db.prepare('INSERT OR IGNORE INTO targets VALUES(?)')

// --list only reads the db, so handle it before insisting on targets:
// listing a db without recorded targets prints an empty list.
if (options.list) {
  console.log('[recorded target folders]')
  console.log(psGetTarget.all().map(x => x.target).join('\n'))
  process.exit(0)
}

// A scan needs at least one target, given now or recorded by an earlier run.
if (options.target === undefined && psGetTarget.all().length === 0) {
  console.log(getUsage(usageSpec))
  process.exit(1)
}
/**
 * Compute the SHA-256 digest of a file by streaming its content.
 *
 * @param {string} filename - path of the file to hash
 * @returns {Promise<string>} resolves with the digest as a hex string;
 *   rejects with the stream error when the file cannot be opened or read
 */
function getHash (filename) {
  return new Promise((resolve, reject) => {
    const sha256 = crypto.createHash('sha256')
    const rs = fs.createReadStream(filename)
    // Without an 'error' listener a read failure is thrown as an uncaught
    // exception and the promise never settles.
    rs.on('error', reject)
    rs.on('data', data => sha256.update(data))
    rs.on('end', () => resolve(sha256.digest('hex')))
  })
}
/**
 * Convert a date value to milliseconds since the Unix epoch.
 *
 * @param {Date|string|number} datestr - anything the Date constructor accepts
 * @returns {number} epoch milliseconds (NaN when the value is not a valid date)
 */
function datestr2ms (datestr) {
  const parsed = new Date(datestr)
  return parsed.getTime()
}
// Duplicate groups found during this run, keyed by `${size}_${hash}`.
// Each value is an object whose keys are the file names in that group.
let crash = {}

/**
 * Examine one file: record it in the db, or register it as a duplicate.
 *
 * - Already recorded with the same mtime: nothing happens.
 * - No other recorded file has this size: the file is recorded without a hash.
 * - Otherwise the recorded same-size files that still lack a hash are hashed,
 *   then this file is hashed and compared. A match adds the whole group to
 *   `crash`; no match records the file together with its hash.
 *
 * @param {string} file - path of the file
 * @param {object} st - stat result for `file`; `size` and `mtime` are read
 * @returns {Promise<undefined>} settles once the file has been handled
 */
function processFile (file, st) {
  const exists = psExists.get(file)
  const mtime = datestr2ms(st.mtime)
  if (exists !== undefined && exists.mtime === mtime) return Promise.resolve()

  // The file's own (outdated) row must never count as its duplicate.
  const sizeDup = psCheckSize.all(st.size).filter(dup => dup.filename !== file)
  if (sizeDup.length === 0) { // new size
    if (exists === undefined) psInsertWoHash.run(file, st.size, mtime)
    else psUpdate.run(st.size, null, mtime, file) // old hash is stale; drop it
    return Promise.resolve()
  }

  // Hashes are computed lazily: fill them in for same-size rows that lack one.
  const pending = sizeDup
    .filter(dup => dup.hash === null)
    .map(dup => getHash(dup.filename).then(hash => {
      psUpdateHash.run(hash, dup.filename)
    }))

  return Promise.all(pending)
    .then(() => getHash(file))
    .then(hash => {
      // Query the db again instead of reusing the snapshot taken above:
      // files processed concurrently may have been recorded while this one
      // was being hashed. The lookup and the write below happen in the same
      // synchronous step, so no other file can slip in between them.
      const matches = psCheckHash.all(st.size, hash)
        .map(row => row.filename)
        .filter(name => name !== file)
      if (matches.length !== 0) { // hash matches
        const crashKey = `${st.size}_${hash}`
        if (!(crashKey in crash)) crash[crashKey] = {}
        for (const name of matches) {
          crash[crashKey][name] = true
        }
        crash[crashKey][file] = true
      } else if (psExists.get(file) === undefined) { // new hash
        psInsert.run(file, st.size, hash, mtime)
      } else {
        psUpdate.run(st.size, hash, mtime, file)
      }
    })
}
/**
 * Recursively process every entry below a directory.
 *
 * Sub-directories are walked with processDir, everything else is handed to
 * processFile. An entry that cannot be stat'ed (for example a dangling
 * symbolic link, or a file removed while the scan is running) is reported on
 * stderr and skipped instead of aborting the whole scan.
 *
 * @param {string} dir - directory to walk
 * @returns {Promise<Array>} settles when every processed entry has settled
 */
function processDir (dir) {
  return fs.readdirAsync(dir).then(files => {
    const promises = []
    for (const name of files) {
      const file = path.join(dir, name)
      let st
      try {
        st = fs.statSync(file)
      } catch (e) {
        console.error(`skipped: ${file} (${e.message})`)
        continue
      }
      promises.push(st.isDirectory() ? processDir(file) : processFile(file, st))
    }
    return Promise.all(promises)
  })
}
// Tell whether a recorded row no longer describes the file on disk: the file
// cannot be stat'ed, or its mtime or size differs from the recorded values.
const isStale = entry => {
  try {
    const st = fs.statSync(entry.filename)
    return entry.mtime !== datestr2ms(st.mtime) || entry.size !== st.size
  } catch (e) {
    return true
  }
}

// Pass 1: collect the stale rows. Deletion is deferred until the walk over
// the table has finished (presumably because the connection cannot run
// another statement while iterating -- confirm against the db library).
const invalids = []
for (const entry of psWalk.iterate()) {
  if (isStale(entry)) invalids.push(entry.filename)
}

// Pass 2: report and drop them, so the scan treats those files as new.
invalids.forEach(invalid => {
  console.log(`invalid: ${invalid}`)
  psDelete.run(invalid)
})

// Remember the targets given on the command line for later runs.
if (options.target !== undefined) {
  for (const t of options.target) {
    psInsertTarget.run(t)
  }
}
// Scan every recorded target folder (including the ones just added), then
// print each group of duplicates: a size/hash header followed by its files.
const promiseTargets = psGetTarget.all().map(x => processDir(x.target))
Promise.all(promiseTargets).then(() => {
  for (const key of Object.keys(crash)) {
    const [size, hash] = key.split('_')
    console.log(`size: ${size} hash: ${hash}`)
    for (const file of Object.keys(crash[key])) {
      console.log(` ${file}`)
    }
  }
}).catch(err => {
  // A failed scan must not end as an unhandled rejection: report it and
  // make the process exit with a failure status.
  console.error(err)
  process.exitCode = 1
})
{
"name": "dupcheck",
"version": "1.0.0",
"description": "File duplication checker",
"main": "dupcheck.js",
"dependencies": {
"better-sqlite3": "^4.1.0",
"bluebird": "^3.5.1",
"command-line-args": "^5.0.2",
"command-line-usage": "^4.1.0",
"fs-extra": "^5.0.0",
"node-gyp": "^3.6.2"
},
"devDependencies": {},
"scripts": {
"test": "mocha test.js"
},
"author": "",
"license": "ISC"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment