Skip to content

Instantly share code, notes, and snippets.

@MrDrMcCoy
Last active December 11, 2017 23:58
Show Gist options
  • Save MrDrMcCoy/feb265c2d5b340e4a69e2e298399d588 to your computer and use it in GitHub Desktop.
Save MrDrMcCoy/feb265c2d5b340e4a69e2e298399d588 to your computer and use it in GitHub Desktop.
A simple duplicate file finder
#!/usr/bin/python2
def _file_digest(path, chunk_size=1 << 20):
    """Return the SHA-256 hex digest of the raw bytes stored at *path*."""
    import hashlib

    digest = hashlib.sha256()
    # Binary mode plus chunked reads: no newline/encoding translation and no
    # need to hold a whole (possibly huge) file in memory.
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _audio_digest(path):
    """Return the SHA-256 hex digest of the first 10 s of decoded audio.

    The first audio stream of *path* is decoded by ffmpeg to raw unsigned
    8-bit samples with metadata stripped, so files that differ only in tags
    produce the same digest.

    Raises:
        OSError: ffmpeg could not be started (e.g. it is not installed).
        RuntimeError: ffmpeg exited with an error or produced no samples.
    """
    import hashlib
    import subprocess

    # An argument list (no shell) keeps quotes, spaces and shell
    # metacharacters in file names from being interpreted as commands.
    command = [
        'ffmpeg', '-t', '10', '-i', path,
        '-map_metadata', '-1', '-map', '0:a:0', '-f', 'u8', 'pipe:',
    ]
    proc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() closes stdin, drains both output pipes and waits for the
    # process, so no pipe is leaked and ffmpeg cannot block on a full stderr.
    samples, _ = proc.communicate()
    # Every failed decode yields the same (empty) output; hashing it would
    # make all undecodable files look like duplicates of one another.
    if proc.returncode != 0 or not samples:
        raise RuntimeError('ffmpeg could not decode audio from: ' + path)
    return hashlib.sha256(samples).hexdigest()


def linkdupes(base_dir, loglevel='DEBUG'):
    """Delete duplicate files found under *base_dir*.

    The tree is walked (following directory symlinks, each real directory
    visited once, entries in sorted order).  Files whose guessed MIME type is
    ``audio/*`` are fingerprinted from the first 10 seconds of decoded audio;
    all other regular files are fingerprinted from their raw bytes.  The first
    file seen with a given fingerprint is kept and every later one is removed.

    NOTE: despite the name, duplicates are removed, not replaced by symlinks.

    Args:
        base_dir: Root directory to scan.
        loglevel: Level (name or number) applied to the 'linkdupes' logger.

    Returns:
        None.  Scanning stops early if a duplicate cannot be removed.
    """
    import logging
    import mimetypes
    import os

    logging.basicConfig(format='%(asctime)s | %(levelname)s | %(funcName)s | %(message)s')
    logger = logging.getLogger('linkdupes')
    logger.setLevel(loglevel)
    mimetypes.add_type('audio/ape', '.ape')

    originals = {}
    visited_dirs = set()
    for root, dirs, files in os.walk(base_dir, followlinks=True):
        # With followlinks=True a symlinked directory can lead back into the
        # tree: that would either recurse forever or present the same real
        # file twice, and deleting the "second" copy would delete the only one.
        real_root = os.path.realpath(root)
        if real_root in visited_dirs:
            dirs[:] = []
            continue
        visited_dirs.add(real_root)
        # Sorted traversal makes it deterministic which copy is kept.
        dirs.sort()
        logger.info('Processing directory: %s', root)
        for name in sorted(files):
            current = os.path.join(root, name)
            logger.debug('Processing file: %s', name)
            # Links are never hashed and, crucially, never fall through to the
            # duplicate check with a fingerprint left over from another file.
            if os.path.islink(current):
                logger.debug('Skipping link: %s', current)
                continue
            # os.walk also reports FIFOs, sockets and devices as "files";
            # opening those could block indefinitely.
            if not os.path.isfile(current):
                logger.debug('Skipping non-regular file: %s', current)
                continue
            try:
                mime_type, _ = mimetypes.guess_type(current, strict=False)
                if mime_type is not None and mime_type.startswith('audio/'):
                    digest = _audio_digest(current)
                    logger.debug('ffmpeg hash: %s', digest)
                    # Keying on the kind keeps an audio fingerprint from ever
                    # matching a plain-file fingerprint.
                    key = ('audio', digest)
                else:
                    digest = _file_digest(current)
                    logger.debug('Plain file hash: %s', digest)
                    key = ('file', digest)
            except Exception:
                # Best effort: a file that cannot be fingerprinted is logged
                # and left untouched.
                logger.exception('Error processing file: %s', current)
                continue
            if key not in originals:
                originals[key] = current
                continue
            logger.info('Found duplicate file: %s', current)
            try:
                os.remove(current)
            except OSError:
                logger.exception('Unable to remove duplicate: %s', current)
                return
            logger.debug('Removed dupe.')
if __name__ == '__main__':
    # Only scan the default music library when run as a script, so that merely
    # importing this module never deletes files as a side effect.
    linkdupes('/media/music', 'DEBUG')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment