Last active
March 6, 2018 11:41
-
-
Save yak1ex/c90aed92a920f043e8a388871a0233c5 to your computer and use it in GitHub Desktop.
dupchecker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| node_modules/ | |
| *.BAK |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 'use strict' | |
| // TODO: error handling | |
| const Promise = require('bluebird') | |
| const fs = Promise.promisifyAll(require('fs-extra')) | |
| const path = require('path') | |
| const Database = require('better-sqlite3') | |
| const crypto = require('crypto') | |
| const commandLineArgs = require('command-line-args') | |
| const getUsage = require('command-line-usage') | |
// Sections handed to command-line-usage; the 'Options' section (index 2) also
// supplies the option definitions used to parse argv.
const usageSpec = [
  { header: 'dupcheck.js', content: 'File duplication checker' },
  {
    header: 'Synopsis',
    // Every synopsis row has the shape "$ dupcheck.js <arguments>"
    content: [
      '--db <dbfile> <target_dirs>...',
      '--db <dbfile> --target <target_dir>...',
      '--db <dbfile> --target <target_dir> --target <target_dir>...',
      '--db <dbfile>',
      '--db <dbfile> --list',
      '--help'
    ].map(args => ({ colA: '$', colB: `dupcheck.js ${args}` }))
  },
  {
    header: 'Options',
    optionList: [
      {
        name: 'db',
        alias: 'd',
        type: String,
        typeLabel: 'dbfile',
        description: 'A hash db filename'
      },
      {
        name: 'target',
        alias: 't',
        type: String,
        typeLabel: 'target_dir',
        multiple: true,
        defaultOption: true,
        description: 'Target folders'
      },
      {
        name: 'list',
        alias: 'l',
        type: Boolean,
        description: 'Show recorded target folders'
      },
      {
        name: 'help',
        alias: 'h',
        type: Boolean,
        description: 'Show help'
      }
    ]
  },
  {
    header: 'Notes',
    content: [
      'Specified target folders are recorded in db.',
      'You can omit target options if they are already recorded.'
    ]
  }
]
// Parse argv with the option definitions from the 'Options' usage section
const options = commandLineArgs(usageSpec[2].optionList)

// A db file is mandatory: print the usage text and exit with status 1 when it
// is missing, and likewise when --help was requested
if (options.db === undefined || options.help) {
  console.log(getUsage(usageSpec))
  process.exit(1)
}
// Open the hash database named by --db
const db = new Database(options.db)

// Make sure the schema is present. Each statement is idempotent on its own, so
// a db that already has `files` but lacks the index or the `targets` table is
// completed instead of failing later, and no probe of sqlite_master (which
// relied on a double-quoted string literal) is needed.
//   files:   one row per recorded file (hash stays NULL until it is needed)
//   targets: the folders to scan
db.exec('CREATE TABLE IF NOT EXISTS files (filename TEXT PRIMARY KEY, size INTEGER, hash TEXT, mtime INTEGER)')
db.exec('CREATE INDEX IF NOT EXISTS files_idx on files (size, hash)')
db.exec('CREATE TABLE IF NOT EXISTS targets (target TEXT PRIMARY KEY)')
// --- prepared statements on the `files` table ---

// Reads
const psWalk = db.prepare('SELECT * FROM files')
const psExists = db.prepare('SELECT mtime FROM files WHERE filename = ?')
const psGetHash = db.prepare('SELECT hash FROM files WHERE filename = ?')
const psCheckSize = db.prepare('SELECT filename, hash FROM files WHERE size = ?')
const psCheckHash = db.prepare('SELECT filename FROM files WHERE size = ? AND hash = ?')

// Writes
const psInsert = db.prepare('INSERT INTO files VALUES(?,?,?,?)')
const psInsertWoHash = db.prepare('INSERT INTO files (filename, size, mtime) VALUES(?,?,?)')
const psUpdate = db.prepare('UPDATE files SET size = ?, hash = ?, mtime = ? WHERE filename = ?')
const psUpdateHash = db.prepare('UPDATE files SET hash = ? WHERE filename = ?')
const psDelete = db.prepare('DELETE FROM files WHERE filename = ?')

// --- prepared statements on the `targets` table ---
const psGetTarget = db.prepare('SELECT * FROM targets')
const psInsertTarget = db.prepare('INSERT OR IGNORE INTO targets VALUES(?)')
if (options.target === undefined) {
  // No target on the command line: at least one must already be recorded
  if (psGetTarget.all().length === 0) {
    console.log(getUsage(usageSpec))
    process.exit(1)
  }
}

// --list: print the recorded target folders, one per line, and stop
if (options.list) {
  console.log('[recorded target folders]')
  const recorded = psGetTarget.all().map(({ target }) => target)
  console.log(recorded.join('\n'))
  process.exit(0)
}
/**
 * Compute the SHA-256 digest of a file by streaming its contents.
 *
 * @param {string} filename - Path of the file to hash.
 * @returns {Promise<string>} Resolves with the hex digest once the stream has
 *   ended; rejects with the stream's error when the file cannot be opened or read.
 */
function getHash (filename) {
  return new Promise((resolve, reject) => {
    const sha256 = crypto.createHash('sha256')
    const rs = fs.createReadStream(filename)
    // A stream 'error' with no listener is thrown as an uncaught exception and
    // the promise would never settle; route it to the caller instead.
    rs.on('error', reject)
    rs.on('data', chunk => sha256.update(chunk))
    rs.on('end', () => resolve(sha256.digest('hex')))
  })
}
/**
 * Convert a date value to milliseconds since the Unix epoch.
 *
 * @param {string|number|Date} datestr - Any value accepted by the Date constructor.
 * @returns {number} Epoch milliseconds (NaN when the value is not a valid date).
 */
function datestr2ms (datestr) {
  const parsed = new Date(datestr)
  return parsed.getTime()
}
// Duplicate groups found during this run: `${size}_${hash}` -> { filename: true, ... }
let crash = {}

/**
 * Check one file against the hash db.
 *
 * A file whose recorded mtime matches is skipped. Otherwise the file is either
 * recorded in the db (no other recorded file has the same size and hash) or
 * added to `crash` together with the recorded files it duplicates. Hashes are
 * only computed when another recorded file has the same size.
 *
 * @param {string} file - Path of the file.
 * @param {{size: number, mtime: *}} st - Stat result for the file; only `size`
 *   and `mtime` are read.
 * @returns {Promise<undefined>} Settles once the file has been recorded or
 *   reported; rejects if hashing or a db statement fails.
 */
function processFile (file, st) {
  const exists = psExists.get(file)
  const mtime = datestr2ms(st.mtime)
  // Already recorded and unchanged: nothing to do
  if (exists !== undefined && exists.mtime === mtime) return Promise.resolve()

  // Write this file's row. Existence is checked at write time (not taken from
  // `exists`) because hashing is asynchronous and the row may have appeared
  // meanwhile; inserting over an existing row would violate the primary key.
  const record = hash => {
    if (psExists.get(file) !== undefined) {
      psUpdate.run(st.size, hash, mtime, file)
    } else if (hash === null) {
      psInsertWoHash.run(file, st.size, mtime)
    } else {
      psInsert.run(file, st.size, hash, mtime)
    }
  }

  // Recorded files of the same size; the file's own (stale) row is not a candidate
  const sizeDup = psCheckSize.all(st.size).filter(dup => dup.filename !== file)
  if (sizeDup.length === 0) { // new size
    record(null)
    return Promise.resolve()
  }

  // Fill in the hash of every same-size candidate that does not have one yet
  const pending = sizeDup
    .filter(dup => dup.hash === null)
    .map(dup => getHash(dup.filename).then(hash => {
      psUpdateHash.run(hash, dup.filename)
    }))

  return Promise.all(pending)
    .then(() => getHash(file))
    .then(hash => {
      // Look up matches now, in the same tick as the write below, rather than
      // in the list read before hashing: other files may have been recorded
      // while this one was being hashed, and a stale list would let two new
      // identical files both be inserted without ever being reported.
      const matches = psCheckHash.all(st.size, hash)
        .map(row => row.filename)
        .filter(name => name !== file)
      if (matches.length === 0) { // new hash
        record(hash)
        return
      }
      // hash matches
      const crashKey = `${st.size}_${hash}`
      if (!(crashKey in crash)) crash[crashKey] = {}
      for (const name of matches) {
        crash[crashKey][name] = true
      }
      crash[crashKey][file] = true
    })
}
/**
 * Recursively process every entry below a directory.
 *
 * Sub-directories are walked with processDir and every other entry is handed
 * to processFile. An entry that cannot be stat'ed is reported on stderr and
 * skipped, so that one bad entry does not abort the whole scan.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<Array>} Settles when all entries below `dir` have been processed.
 */
function processDir (dir) {
  return fs.readdirAsync(dir).then(files => {
    const promises = []
    for (const name of files) {
      const file = path.join(dir, name)
      let st
      try {
        st = fs.statSync(file)
      } catch (e) {
        console.error(`skipped: ${file} (${e.message})`)
        continue
      }
      promises.push(st.isDirectory() ? processDir(file) : processFile(file, st))
    }
    return Promise.all(promises)
  })
}
// Recorded files that are gone, or whose size or mtime differs from the db row
const invalids = []
for (const { filename, mtime, size } of psWalk.iterate()) {
  let stale = true
  try {
    const st = fs.statSync(filename)
    stale = mtime !== datestr2ms(st.mtime) || size !== st.size
  } catch (e) {
    // stat failed: keep the entry marked as invalid
  }
  if (stale) invalids.push(filename)
}

// Deletions are done only after the walk over the table has finished
invalids.forEach(invalid => {
  console.log(`invalid: ${invalid}`)
  psDelete.run(invalid)
})
// Record the targets given on the command line
if (options.target !== undefined) {
  options.target.forEach(t => psInsertTarget.run(t))
}

// Scan every recorded target, then print each group of duplicates
const promiseTargets = psGetTarget.all().map(x => processDir(x.target))
Promise.all(promiseTargets).then(() => {
  for (const key of Object.keys(crash)) {
    // Keys have the form `${size}_${hash}`
    const [size, hash] = key.split('_')
    console.log(`size: ${size} hash: ${hash}`)
    for (const file of Object.keys(crash[key])) {
      console.log(` ${file}`)
    }
  }
}).catch(e => {
  // A failed scan must not look like a clean run with no duplicates
  console.error(e)
  process.exitCode = 1
})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "name": "dupcheck", | |
| "version": "1.0.0", | |
| "description": "File duplication checker", | |
| "main": "dupcheck.js", | |
| "dependencies": { | |
| "better-sqlite3": "^4.1.0", | |
| "bluebird": "^3.5.1", | |
| "command-line-args": "^5.0.2", | |
| "command-line-usage": "^4.1.0", | |
| "fs-extra": "^5.0.0", | |
| "node-gyp": "^3.6.2" | |
| }, | |
| "devDependencies": {}, | |
| "scripts": { | |
| "test": "mocha test.js" | |
| }, | |
| "author": "", | |
| "license": "ISC" | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment