Skip to content

Instantly share code, notes, and snippets.

@AGulev
Last active April 26, 2024 02:32
Show Gist options
  • Save AGulev/d5dc12127e0fbe1cd4f239effb76cd81 to your computer and use it in GitHub Desktop.
Save AGulev/d5dc12127e0fbe1cd4f239effb76cd81 to your computer and use it in GitHub Desktop.
Python script for finding duplicate files in a folder. Modification of: https://www.pythoncentral.io/finding-duplicate-files-with-python/ (a file size counter was added)
# dupFinder.py
import os, sys, stat
import hashlib
def findDup(parentFolder):
    """Walk *parentFolder* and group every readable file by content hash.

    Args:
        parentFolder: Root directory passed to ``os.walk``.

    Returns:
        A dict of the form ``{hash: {'path': [paths, ...], 'size': bytes}}``.
        ``hash`` is the value returned by ``hashfile``, ``path`` lists every
        file found with that hash, and ``size`` is the size in bytes of the
        first file recorded for it.
    """
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            try:
                file_hash = hashfile(path)
                size = os.path.getsize(path)
            except (IOError, OSError) as err:
                # Broken symlinks, permission errors or files removed while
                # scanning must not abort the whole walk; report and move on.
                print('Skipping %s (%s)' % (path, err))
                continue
            # The first file seen with this hash creates the entry; later
            # ones only extend its path list.
            entry = dups.setdefault(file_hash, {'path': [], 'size': size})
            entry['path'].append(path)
    return dups
def joinDicts(dict1, dict2):
    """Merge the duplicate table *dict2* into *dict1* in place.

    Both arguments map a hash to an entry dict holding at least a ``'path'``
    list. For a hash present in both, paths from *dict2* that *dict1* does
    not already list are appended; other keys of the existing *dict1* entry
    are left as they are. For a hash only in *dict2*, a copy of the entry is
    inserted.

    Args:
        dict1: Table that receives the merged result (mutated).
        dict2: Table to merge in (left unmodified).

    Returns:
        None.
    """
    for key, entry in dict2.items():
        if key in dict1:
            known = dict1[key]['path']
            seen = set(known)
            for path in entry['path']:
                # The same path can arrive twice when the scanned folders
                # overlap; listing it again would report a file as a
                # duplicate of itself.
                if path not in seen:
                    seen.add(path)
                    known.append(path)
        else:
            # Copy the entry and its path list so later merges into dict1
            # never mutate the caller's dict2.
            copied = dict(entry)
            copied['path'] = list(entry['path'])
            dict1[key] = copied
def hashfile(path, blocksize = 65536):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in chunks of *blocksize* bytes so that large files are
    never loaded into memory at once.

    Args:
        path: Path of the file to hash; opened in binary mode.
        blocksize: Number of bytes to read per chunk.

    Returns:
        The MD5 digest as a hex string.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    # The context manager guarantees the handle is closed even when a read
    # fails part-way through the file.
    with open(path, 'rb') as afile:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()
def sizeof_fmt(num, suffix):
    """Format *num* as a human-readable quantity with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal place, the matching prefix
    ('', 'Ki', 'Mi', ... 'Zi') and *suffix*. Anything still too large after
    'Zi' is reported in 'Yi'.

    Args:
        num: The quantity to format (e.g. a byte count).
        suffix: Unit text appended after the prefix, e.g. ``'B'``.

    Returns:
        The formatted string, e.g. ``'1.5KiB'``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(prefixes):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefixes[position], suffix)
        value /= 1024.0
        position += 1
    # Ran past every listed prefix: the value is expressed in 'Yi'.
    return "%.1f%s%s" % (value, 'Yi', suffix)
def printResults(dict1):
    """Print every group of duplicate files and the total reclaimable size.

    Args:
        dict1: Table mapping a hash to ``{'path': [paths, ...], 'size': n}``.
            Only entries listing more than one path are reported.

    Groups are printed in ascending order of ``size``. The total counts each
    group's size once per redundant copy (number of paths minus one). When no
    entry has more than one path, a single "no duplicates" line is printed.
    """
    groups = [entry for entry in dict1.values() if len(entry['path']) > 1]
    if not groups:
        print('No duplicate files found.')
        return

    groups.sort(key=lambda entry: entry['size'])
    print('Duplicates:')
    print('___________________')
    wasted = 0
    for group in groups:
        size = group['size']
        # One copy is the "original"; every further copy is wasted space.
        wasted += size * (len(group['path']) - 1)
        print('\t%s' % sizeof_fmt(size, 'B'))
        for member in group['path']:
            print('\t\t%s' % member)
        print('___________________')
    print('TOTAL SIZE %s' % sizeof_fmt(wasted, 'B'))
# Command-line entry point: scan every folder given as an argument, merge the
# per-folder results and print the duplicate report.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        folders = sys.argv[1:]
        for folder in folders:
            if not os.path.exists(folder):
                print('%s is not a valid path, please verify' % folder)
                # Exit with a non-zero status so shells and calling scripts
                # can tell that the run failed.
                sys.exit(1)
            # Find the duplicated files and merge them into the shared table.
            joinDicts(dups, findDup(folder))
        printResults(dups)
    else:
        print('Usage: python duplicates_finder.py folder or python duplicates_finder.py folder1 folder2 folder3')
@a-1983246
Copy link

a-1983246 commented Oct 9, 2020

I am getting this error

Scanning folder1...
Scanning folder2...
Traceback (most recent call last):
  File "duplicates_finder.py", line 80, in <module>
    joinDicts(dups, findDup(i))
  File "duplicates_finder.py", line 30, in joinDicts
    dict1[key] = dict1[key] + dict2[key]
TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

@ramakrishnaraov
Copy link

The following update is working:

# Joins two dictionaries

def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            #dict1[key] = dict1[key] + dict2[key]
            dict1[key]['path'] += dict2[key]['path']
        else:
            dict1[key] = dict2[key]

@AGulev
Copy link
Author

AGulev commented Jul 19, 2021

@ramakrishnaraov Thank you, I fixed it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment