Skip to content

Instantly share code, notes, and snippets.

@righthandabacus
Last active February 3, 2018 21:23
Show Gist options
  • Save righthandabacus/93036ed3a1868181d8349c350f57be13 to your computer and use it in GitHub Desktop.
Save righthandabacus/93036ed3a1868181d8349c350f57be13 to your computer and use it in GitHub Desktop.
Remove duplicate files in a directory
#!/usr/bin/env python
from __future__ import print_function
import argparse
import hashlib
import itertools
import os
import multiprocessing as mp
import sys
def md5pair(filename, chunk_size=1 << 20):
    '''
    Compute the md5 checksum of a file

    filename:   path of the file to hash
    chunk_size: number of bytes read per iteration (default 1 MiB); the file
                is hashed piecewise so large files never have to fit in memory

    Returns the tuple (filename, hex digest). The filename is echoed back so
    results coming out of a parallel map can be matched to their input.
    Errors from opening or reading the file are propagated to the caller.
    '''
    digest = hashlib.md5()
    # the with-block guarantees the descriptor is closed even if read() fails
    with open(filename, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return (filename, digest.hexdigest())
def md5walk(rootdir, num, kidding):
    '''
    Visit rootdir and every directory below it, running the duplicate
    check on the files of each directory separately

    rootdir: directory where the walk starts
    num:     passed through to md5check (number of parallel processes)
    kidding: passed through to md5check (true means do not delete)

    The name of each visited directory is printed. Nothing is returned.
    '''
    for current, children, names in os.walk(rootdir):
        print(current)
        # reorder the list os.walk handed out, in place, so that the
        # subdirectories are descended into in dictionary order
        children[:] = sorted(children)
        md5check(current, names, num, kidding)
def md5check(dirname, files, num, kidding):
    '''
    Hash the files of one directory and delete all but one copy of each
    group of files whose md5 checksums are equal

    dirname: directory that contains the files
    files:   file names (without path) inside dirname; names starting with
             '.' and symbolic links are ignored
    num:     number of worker processes; a multiprocessing pool is used only
             when num > 1, otherwise hashing runs in this process
    kidding: if true, only report the duplicates and delete nothing

    Within a group of equal files the one with the shortest path is kept
    (ties are broken by lexicographic order of the path). Progress and the
    keep/deleted report are printed to stdout. Nothing is returned.
    '''
    padding = ' '*30
    fullpaths = []
    for f in files:
        if f.startswith('.'):
            continue  # hidden file
        path = os.path.join(dirname, f)
        # A symlink hashes to the same value as its target; treating the two
        # as duplicates could delete the real file and keep a dangling link.
        # A broken symlink cannot be opened at all. Skip both cases.
        if os.path.islink(path):
            continue
        fullpaths.append(path)
    filecount = len(fullpaths)
    if filecount < 2:
        return  # fewer than two candidates: no duplicate is possible
    pool = None
    if num > 1:
        # multiprocessing, use pool
        pool = mp.Pool(processes=num)
        imap = pool.imap
    else:
        # single process: lazy map (itertools.imap on Python 2, the
        # built-in map on Python 3 where imap no longer exists)
        imap = getattr(itertools, 'imap', map)
    # obtain md5 of all files and save into dict: checksum -> list of paths
    md5dicts = {}
    try:
        for i, (fullpath, cksum) in enumerate(imap(md5pair, fullpaths)):
            if i % 3 == 0:
                # refresh the progress line only on every third file
                print("%d/%d %s%s" % (i+1, filecount, fullpath, padding), end='\r')
                sys.stdout.flush()
            md5dicts.setdefault(cksum, []).append(fullpath)
    finally:
        # release the workers even when hashing raised; once the loop has
        # consumed every result there is no pending work left to lose
        if pool is not None:
            pool.terminate()
            pool.join()
    # look for duplicates in dict and delete them
    for cksum, filelist in md5dicts.items():
        if len(filelist) <= 1:
            continue  # checksum unique, ignore this
        print(cksum)
        for i, f in enumerate(sorted(filelist, key=lambda x: (len(x), x))):
            # keep only one file, prefer the shortest filename
            if i == 0:
                action = 'keep'
            else:
                action = 'deleted'
                if not kidding:
                    os.unlink(f)
            print("\t%s (%s)" % (f, action))
def main():
    '''
    Command line entry point: read the root directory, the number of
    processes and the dry-run switch, then start the directory walk
    '''
    cli = argparse.ArgumentParser()
    cli.add_argument("rootdir")
    cli.add_argument(
        "-n", "--num",
        type=int,
        default=4,
        help="Number of parallel processes to run"
    )
    cli.add_argument(
        "-k", "--kidding",
        action='store_true',
        default=False,
        help="Do not actually delete files"
    )
    options = cli.parse_args()
    md5walk(options.rootdir, options.num, options.kidding)


# run only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment