Created
April 19, 2018 18:12
-
-
Save genotrance/ef84e2ee7daab54c0a385e92c0a2757d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncdispatch | |
import asynctools | |
import docopt | |
import json | |
import nre | |
import os | |
import ospaths | |
import sequtils | |
import sha256/sha256sum | |
import strutils | |
import tables | |
import threadpool | |
import times | |
# ###
# Constants
const
  # A fingerprint comparison scoring above this value counts as a duplicate.
  FINGERPRINT_MATCH_THRESHOLD = 0.92
  # Passed as `maxoffset` to the native fingerprint matcher.
  FINGERPRINT_MATCH_OFFSET = 80
  # Mask applied to every raw fingerprint value before it is used as an
  # index key (clears the low 8 bits).
  FINGERPRINT_RELEVANT_BITS = 0xFFFFFF00'u32
  # Upper bound handed to the thread pool.
  MAX_THREADS = 4
# ###
# Handle CTRL-C
proc chandler() {.noconv.} =
  ## Ctrl-C hook: prints "Exiting" on a fresh line and terminates the
  ## process with exit status 1.
  # NOTE(review): the GC setup call is presumably needed because the hook
  # can run on a thread the Nim runtime did not create - confirm.
  setupForeignThreadGc()
  echo "\nExiting"
  quit(QuitFailure)

# Install the hook for the whole program.
setControlCHook(chandler)
# ##
# Maximum number of threads
MAX_THREADS.setMaxPoolSize()

# ###
# FFprobe CLI
#let FFPROBE = ["-hide_banner", "-of", "json", "-v", "quiet", "-show_format", "-show_entries", "format=filename,duration:format_tags=title,artist,album_artist,composer,album,Acoustid Id,MusicBrainz Release Track Id"]
# Lower-case file extensions (with the leading dot) that the -M search
# treats as music files.
let FFFORMAT = [
  ".mp3",
  ".m4a",
  ".ogg",
  ".flac"
]
# ###
# Command line arguments
# Usage text handed to docopt. The layout is significant: every line of the
# "Usage:" and "Options:" sections is indented, and each option is separated
# from its description by at least two spaces. The sub-headings (Search,
# Filters, Actions) are indented as well so they do not end the Options
# section early.
const DOC = """
Automatic duplicate file finder

Usage:
  autodup [options] <sourcedir> [<dupdir>]

Options:
  -h --help
  Search
  -D            Search for duplicate files
  -E            Search for empty directories
  -M            Search for duplicate music files (requires fpcalc)
  Filters
  -f            Include files only
  -d            Include directories only
  -p <pattern>  Include files / directories containing pattern
  -P <regex>    Include files / directories containing regex
  -s <fsize>    Include size greater than (in bytes)
  -S <fsize>    Include size lesser than (in bytes)
  -t <time>     Include last modified after (in days)
  -T <time>     Include last modified before (in days)
  Actions
  -m            Move search results
  -x            Delete search results
  -q            Quiet - don't display results
"""
# Parsed command line, keyed by the option / argument names used in DOC.
# Declared as a threadvar, so the value assigned here belongs to the thread
# that runs this module-level code.
var ARGS {.threadvar.}: Table[string, Value]
ARGS = docopt(DOC)

proc getintflag(flag: string): int =
  ## Returns the value supplied for option `flag`, parsed as an integer.
  ##
  ## When the value is not a valid integer an error naming the flag and the
  ## offending text is printed and the program quits with status 1.
  try:
    result = parseInt($ARGS[flag])
  except ValueError:
    # Only parse failures are handled here; anything else is a programming
    # error and is allowed to propagate.
    echo "Bad integer input for " & flag & ": " & $ARGS[flag]
    quit(1)
# Flags
# An argument whose docopt Value stringifies to "nil" is treated as not
# supplied and keeps the default given in the else branch.

# Directory to scan.
var SOURCEDIR =
  if $ARGS["<sourcedir>"] != "nil": $ARGS["<sourcedir>"]
  else: "."

# Directory that receives moved search results. A threadvar cannot carry an
# initialiser, hence the separate assignment.
var DUPDIR {.threadvar.}: string
DUPDIR =
  if $ARGS["<dupdir>"] != "nil": $ARGS["<dupdir>"]
  else: "duplicates"

# Substring / regex name filters; empty means "no filter".
var PATTERN =
  if $ARGS["-p"] != "nil": $ARGS["-p"]
  else: ""
var REGEX =
  if $ARGS["-P"] != "nil": $ARGS["-P"]
  else: ""

# Size limits in bytes and age limits in days; 0 when the option is absent.
var FMINSIZE =
  if $ARGS["-s"] != "nil": getintflag("-s")
  else: 0
var FMAXSIZE =
  if $ARGS["-S"] != "nil": getintflag("-S")
  else: 0
var TIMEAFTER =
  if $ARGS["-t"] != "nil": getintflag("-t")
  else: 0
var TIMEBEFORE =
  if $ARGS["-T"] != "nil": getintflag("-T")
  else: 0

# The duplicate searches (-D, -M) imply -f; the empty-directory search (-E)
# implies -d.
var FILES_ONLY = false
for flag in ["-D", "-M", "-f"]:
  if ARGS[flag]:
    FILES_ONLY = true
var DIRS_ONLY = false
for flag in ["-E", "-d"]:
  if ARGS[flag]:
    DIRS_ONLY = true

# Running totals: bytes matched, number of matches, files and directories
# visited.
var
  ADD: BiggestInt = 0
  MATCH_COUNT = 0
  FILE_COUNT = 0
  DIR_COUNT = 0
# ###
# Tables
type
  FileSize = object
    # Index into FILES of the first file seen with this size.
    first: int
    # Content hash -> index into FILES. Stays nil until a second file of
    # the same size shows up and hashing becomes necessary.
    hashes: TableRef[string, int]

var
  # [fileindex: filename]
  FILES = newSeq[string]()
  # {filesize: FileSize object}
  SIZES = newTable[BiggestInt, FileSize]()

# [fileindex: [aidx1, aidx2...]]
var SONGS {.threadvar.}: TableRef[int, seq[uint32]]
# {aidx: [fileindex1, fileindex2]}
var AIDX {.threadvar.}: TableRef[uint32, seq[int]]
# Threadvars cannot carry an initialiser, so the tables are created here.
AIDX = newTable[uint32, seq[int]]()
SONGS = newTable[int, seq[uint32]]()
# ###
# Actions
proc moveaction(file, dupdir: string) =
  ## Moves `file` below `dupdir`, keeping its path minus the leading
  ## directory component (see `tailDir`). Missing parent directories of the
  ## destination are created.
  ##
  ## An existing destination is never overwritten: "Already exists" is
  ## printed and `file` is left where it is. Any other failure is printed
  ## with the underlying error message instead of being raised.
  let dest = dupdir / tailDir(file)
  # Check explicitly; a rename may otherwise silently replace the target on
  # some platforms.
  if fileExists(dest) or dirExists(dest):
    echo "Already exists " & dest
    return
  try:
    createDir(parentDir(dest))
    moveFile(file, dest)
  except OSError, IOError:
    echo "Failed to move " & file & ": " & getCurrentExceptionMsg()
proc removeaction(file: string, info: FileInfo) =
  ## Deletes `file`: a regular file is removed directly, a directory is
  ## removed together with its contents. Other kinds are left alone.
  ##
  ## Failures are printed, not raised.
  case info.kind
  of pcFile:
    if not tryRemoveFile(file):
      echo "Failed to remove " & file
  of pcDir:
    try:
      removeDir(file)
    except OSError:
      # Only OS-level failures are expected here; programming errors are
      # allowed to propagate.
      echo "Failed to remove dir " & file
  else:
    discard
proc action(file: string, info: FileInfo, orig = "") =
  ## Handles one search result.
  ##
  ## With -m the file is moved into DUPDIR, with -x it is deleted; both run
  ## as spawned tasks. The match is added to the ADD / MATCH_COUNT totals.
  ## Unless -q is given, the path is printed (prefixed with "Moving " or
  ## "Removing " when applicable), followed by " == <orig>" when `orig`
  ## names the file this one duplicates.
  var verb = ""
  if ARGS["-m"]:
    spawn moveaction(file, DUPDIR)
    verb = "Moving "
  elif ARGS["-x"]:
    spawn removeaction(file, info)
    verb = "Removing "
  ADD += info.size
  inc MATCH_COUNT
  # The verb is part of the result line, so quiet mode suppresses it too.
  if not ARGS["-q"]:
    echo verb & file
    if orig != "":
      echo " == " & orig
# ###
# Helpers
proc gethash(file: string): Future[string] {.async.} =
  ## Completes with `sha256sum(file)`.
  ##
  ## The digest is computed by a spawned task; this coroutine checks for the
  ## result every 5 ms and yields to the dispatcher in between.
  let pending = spawn sha256sum(file)
  while true:
    if pending.isReady():
      break
    await sleepAsync(5)
  return ^pending
# ###
# Search
proc finddup(idx: int, info: FileInfo) {.async.} =
  ## Duplicate check for the file at `FILES[idx]`.
  ##
  ## Files are bucketed by size in SIZES. The first file of a size is only
  ## remembered. From the second one on, contents are hashed and compared;
  ## a file whose hash is already known for its size is reported through
  ## `action` together with the earlier file.
  let size = info.size
  if not SIZES.hasKey(size):
    # First file size
    SIZES[size] = FileSize(first: idx, hashes: nil)
    return

  # Size seen before
  let hash = await gethash(FILES[idx])

  if SIZES[size].hashes == nil:
    # Hashes not initialized: the first file of this size has not been
    # hashed yet.
    let first = SIZES[size].first
    let fhash = await gethash(FILES[first])
    # Other finddup calls for the same size may have run while we awaited.
    # Re-check so an already populated table is not replaced.
    if SIZES[size].hashes == nil:
      SIZES[size].hashes = newTable[string, int]()
    if not SIZES[size].hashes.hasKey(fhash):
      SIZES[size].hashes[fhash] = first

  let known = SIZES[size].hashes
  if known.hasKey(hash):
    # Current hash seen before
    action(FILES[idx], info, FILES[known[hash]])
  else:
    # Unique hash
    known[hash] = idx
# Build the bundled C source together with this module.
{.compile: "pg_acoustid/acoustid_compare.c".}
#~ proc match_fingerprints(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint): cfloat {.importc, cdecl, gcsafe.}
#~ proc match_fingerprints2(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
# Native comparison of two raw fingerprints given as (pointer, length)
# pairs.
# NOTE(review): the result is presumably a similarity score; the caller
# treats values above FINGERPRINT_MATCH_THRESHOLD as a match - confirm
# the exact range against the C source.
proc match_fingerprints3(a: ptr uint32, asize: cint,
                         b: ptr uint32, bsize: cint,
                         maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
proc fpcalc(file: string): Future[JsonNode] {.async, inline.} =
  ## Runs `fpcalc -json -raw <file>` (looked up on PATH) and completes with
  ## its output parsed as JSON.
  ##
  ## Completes with nil, after printing "Bad fingerprint: <file>", when the
  ## output is not valid JSON.
  # ExeExt is "exe" on Windows and empty elsewhere, so the same lookup
  # works on every platform instead of only where "fpcalc.exe" exists.
  let exe = addFileExt("fpcalc", ExeExt)
  let args = @["-json", "-raw", file]
  let data = await execProcess(exe, args=args, options={poUsePath})
  var jdata: JsonNode
  try:
    jdata = parseJson(data.output)
  except ValueError:
    # JsonParsingError derives from ValueError.
    echo "Bad fingerprint: $#" % file
    return nil
  return jdata
proc acoustid_compare(idx, id: int): Future[float] {.async.} =
  ## Compares the fingerprints stored in `SONGS[idx]` and `SONGS[id]` with
  ## the native matcher and completes with its score.
  ##
  ## Completes with 0.0 (no match) when either fingerprint is empty.
  let
    na = SONGS[idx].len()
    nb = SONGS[id].len()
  # Taking the address of element 0 of an empty seq is an out-of-bounds
  # access, and there is nothing to compare anyway.
  if na == 0 or nb == 0:
    return 0.0
  # The spawned task gets its own copies in shared memory rather than
  # pointers into the seqs held by SONGS.
  let filedata = createSharedU(uint32, na)
  let fdata = createSharedU(uint32, nb)
  filedata.copyMem(addr SONGS[idx][0], na * sizeof(uint32))
  fdata.copyMem(addr SONGS[id][0], nb * sizeof(uint32))
  let match = spawn match_fingerprints3(filedata, cint(na), fdata, cint(nb), FINGERPRINT_MATCH_OFFSET)
  # Check every 5 ms, yielding to the dispatcher in between.
  while not match.isReady():
    await sleepAsync(5)
  filedata.freeShared()
  fdata.freeShared()
  return ^match
proc findmusicdup(idx: int, info: FileInfo) {.async.} =
  ## Acoustic duplicate check for the file at `FILES[idx]`.
  ##
  ## The raw fingerprint from `fpcalc` is masked with
  ## FINGERPRINT_RELEVANT_BITS and stored in SONGS. Every earlier file that
  ## shares at least one masked value (looked up through AIDX) is compared
  ## once; the first comparison scoring above FINGERPRINT_MATCH_THRESHOLD
  ## reports this file through `action`. Files without a match are added to
  ## AIDX so later files can be compared against them.
  let jdata = await fpcalc(FILES[idx])
  if jdata == nil:
    return
  let fingerprint = jdata{"fingerprint"}
  if fingerprint == nil or fingerprint.kind != JArray:
    # Valid JSON, but without a usable fingerprint array.
    echo "Bad fingerprint: $#" % FILES[idx]
    return

  SONGS[idx] = @[]
  for i in fingerprint.items:
    let aidx: uint32 = uint32(i.getNum()) and FINGERPRINT_RELEVANT_BITS
    SONGS[idx].add(aidx)
  # Distinct masked values, computed once and used for lookup and indexing.
  let keys = SONGS[idx].deduplicate()

  var compare: seq[int] = @[]
  for aidx in keys:
    if AIDX.hasKey(aidx):
      # Iterate over a copy: other findmusicdup calls may append to
      # AIDX[aidx] while this one is suspended in the await below.
      let candidates = AIDX[aidx]
      for id in candidates:
        # Don't compare same two files multiple times
        if not compare.contains(id):
          let match = await acoustid_compare(idx, id)
          if match > FINGERPRINT_MATCH_THRESHOLD:
            action(FILES[idx], info, FILES[id])
            SONGS.del(idx)
            return
          else:
            compare.add(id)

  # Not a duplicate, add to AIDX index for easy comparison
  for aidx in keys:
    if AIDX.hasKey(aidx):
      if not AIDX[aidx].contains(idx):
        AIDX[aidx].add(idx)
    else:
      AIDX[aidx] = @[idx]
proc findempty(dir: string, info: FileInfo) =
  ## Reports `dir` through `action` when it contains no entries at all.
  # walkDir lists every entry, including names starting with a dot, which a
  # "*" glob pattern may skip. A directory holding only hidden files must
  # not be reported (and possibly deleted) as empty.
  for kind, path in walkDir(dir):
    return
  action(dir, info)
# ###
# Scan
proc recurse(dir: string) =
  ## Walks `dir` depth-first and feeds every entry that passes the command
  ## line filters to the selected search (-D, -M, -E) or, when no search is
  ## selected, straight to `action`.
  ##
  ## Sub-directories are descended into before the directory entry itself is
  ## filtered. After the entries of `dir` have been handled, spawned tasks
  ## and pending async work are waited for, and unless -q is given the
  ## running totals are rewritten on the current terminal line.
  let now = getTime()
  let after = initInterval(days=TIMEAFTER)
  let before = initInterval(days=TIMEBEFORE)
  # Compile the -P regex once per directory instead of once per entry.
  var rx: Regex
  if REGEX != "":
    rx = re(REGEX)

  for file in walkPattern(dir & DirSep & "*"):
    var info: FileInfo
    try:
      info = getFileInfo(file)
    except OSError:
      # Entry cannot be inspected; skip it.
      continue

    if info.kind == pcFile:
      FILE_COUNT += 1
      # Skip files
      if DIRS_ONLY:
        continue
    elif info.kind == pcDir:
      DIR_COUNT += 1
      recurse(file)
      # Skip directories
      if FILES_ONLY:
        continue

    # Skip if doesn't match pattern
    let (_, name, ext) = splitFile(file)
    let leaf = name & ext
    if PATTERN != "" and not leaf.contains(PATTERN):
      continue
    if rx != nil and not leaf.contains(rx):
      continue

    # Skip if smaller than
    if $ARGS["-s"] != "nil":
      if info.size < FMINSIZE:
        continue
    # Skip if larger than
    if $ARGS["-S"] != "nil":
      if info.size > FMAXSIZE:
        continue
    # Skip if older than
    if $ARGS["-t"] != "nil":
      if info.lastWriteTime < now - after:
        continue
    # Skip if newer than
    if $ARGS["-T"] != "nil":
      if info.lastWriteTime > now - before:
        continue

    if info.kind == pcFile:
      # Don't process file multiple times
      if not FILES.contains(file):
        FILES.add(file)
        if ARGS["-D"]:
          asyncCheck finddup(FILES.len()-1, info)
        elif ARGS["-M"]:
          if file.splitFile().ext.toLowerAscii() in FFFORMAT:
            asyncCheck findmusicdup(FILES.len()-1, info)
        else:
          action(file, info)
    elif info.kind == pcDir:
      if ARGS["-E"]:
        findempty(file, info)
      else:
        action(file, info)

  sync()
  try:
    runForever()
  except ValueError:
    # The dispatcher raises ValueError once nothing is left pending; that
    # is the normal way out of runForever here.
    discard
  except OSError, IOError:
    # A failed async task (for example a helper program that could not be
    # started) is reported instead of being dropped silently.
    echo "Error: " & getCurrentExceptionMsg()
  # Searches completing inside the dispatcher may have spawned move / remove
  # tasks; wait for those as well.
  sync()

  if not ARGS["-q"]:
    stdout.write("$# matches: $# MB / $# dirs, $# files\r" % [$MATCH_COUNT, formatFloat(float(ADD)/1024/1024, ffDecimal, 2), $DIR_COUNT, $FILE_COUNT])
    stdout.flushFile()
# ###
# Main
recurse(SOURCEDIR)
# Move / remove actions run as spawned tasks; make sure all of them have
# finished before the program exits.
sync()
if not ARGS["-q"]:
  # The running totals line ends in "\r"; terminate it so that whatever is
  # printed next does not overwrite it.
  stdout.write("\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment