Last active
February 3, 2018 21:23
-
-
Save righthandabacus/93036ed3a1868181d8349c350f57be13 to your computer and use it in GitHub Desktop.
Remove duplicate files in a directory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import argparse | |
import hashlib | |
import itertools | |
import os | |
import multiprocessing as mp | |
import sys | |
def md5pair(filename):
    '''
    Compute the MD5 checksum of one file.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    the handle is always closed and large files are never held in memory
    all at once.

    filename -- path of the file to hash
    Returns a tuple (filename, hex digest string).
    Raises whatever open()/read() raise if the file cannot be read.
    '''
    digest = hashlib.md5()
    with open(filename, 'rb') as handle:
        # iter() with a sentinel keeps calling read() until it returns b''
        for chunk in iter(lambda: handle.read(1 << 20), b''):
            digest.update(chunk)
    return (filename, digest.hexdigest())
def md5walk(rootdir, num, kidding):
    '''
    Walk the tree below rootdir and run the duplicate check on each directory.

    Each visited directory name is printed, then its file list is handed to
    md5check together with num and kidding. Nothing is returned.
    '''
    for current, children, names in os.walk(rootdir):
        print(current)
        # sort in place so that os.walk descends in dictionary order
        children.sort()
        md5check(current, names, num, kidding)
def md5check(dirname, files, num, kidding):
    '''
    Find files with identical MD5 inside one directory and delete the extras.

    dirname -- directory that contains the files
    files   -- bare file names inside dirname; names starting with '.' are
               skipped
    num     -- number of worker processes; a multiprocessing pool is used
               only when this is greater than 1
    kidding -- if true, report what would happen but do not unlink anything

    For every group of files sharing a checksum, the file with the shortest
    path (ties broken alphabetically) is kept and the others are removed.
    Progress and results are printed to stdout. Nothing is returned.
    '''
    padding = ' '*30
    fullpaths = [os.path.join(dirname, f) for f in files if not f.startswith('.')]
    filecount = len(fullpaths)
    if not fullpaths:
        # nothing to hash, so do not pay for spawning worker processes
        return

    pool = None
    if num > 1:
        # multiprocessing, use pool
        pool = mp.Pool(processes=num)
        imap = pool.imap
    else:
        # single process: itertools.imap exists only on Python 2, while the
        # builtin map is already lazy on Python 3
        imap = getattr(itertools, 'imap', map)

    # obtain md5 of all files and group the paths by checksum
    md5dicts = {}
    try:
        for i, (fullpath, cksum) in enumerate(imap(md5pair, fullpaths)):
            if i % 3 == 0:
                # '\r' rewrites the same terminal line; padding wipes leftovers
                print("%d/%d %s%s" % (i+1, filecount, fullpath, padding), end='\r')
                sys.stdout.flush()
            md5dicts.setdefault(cksum, []).append(fullpath)
    finally:
        # release the workers even when hashing a file raised
        if pool is not None:
            pool.close()
            pool.join()

    # look for duplicates in dict and delete them
    # (.items() works on both Python 2 and 3; .iteritems() is Python 2 only)
    for cksum, filelist in md5dicts.items():
        if len(filelist) <= 1:
            continue  # checksum unique, ignore this
        print(cksum)
        for i, f in enumerate(sorted(filelist, key=lambda x: (len(x), x))):
            # keep only one file, prefer the shortest filename
            if i == 0:
                action = 'keep'
            else:
                action = 'deleted'
                if not kidding:
                    os.unlink(f)
            print("\t%s (%s)" % (f, action))
def main():
    '''
    Command-line entry point: parse the arguments and start the directory walk.

    Takes a positional rootdir plus the optional -n/--num and -k/--kidding
    switches, then passes the three values to md5walk.
    '''
    option_specs = (
        (("rootdir",), {}),
        (("-n", "--num"),
         dict(type=int, default=4, help="Number of parallel processes to run")),
        (("-k", "--kidding"),
         dict(action='store_true', default=False, help="Do not actually delete files")),
    )
    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    opts = cli.parse_args()
    md5walk(opts.rootdir, opts.num, opts.kidding)
# run the command-line interface only when executed as a script
if "__main__" == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment