Skip to content

Instantly share code, notes, and snippets.

@abshkd
Last active July 29, 2019 15:52
Show Gist options
  • Save abshkd/5683516 to your computer and use it in GitHub Desktop.
Save abshkd/5683516 to your computer and use it in GitHub Desktop.
http://www.webmastersupport.com - A simple Python script to find duplicate files (even if they have been renamed) in a given location.
#!/usr/bin/env python
# A simple python script to list duplicate files for a given path
# This is useful for finding music, videos, pictures, etc. that may have different names or locations but the same content.
# Usage: $ python find_dupes.py <windows or unix path>
# http://www.webmastersupport.com
__author__ = 'abhishek.dujari'
import os
import sys
import hashlib
def _sha256_of_file(path, chunk_size=1 << 20):
    """Return the raw SHA-256 digest of the file at *path*.

    The file is read in pieces of *chunk_size* bytes so that large files
    (videos, disk images, ...) are never held in memory all at once, and the
    handle is closed by the ``with`` block even if reading fails.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.digest()


def traverse(path):
    """Walk *path* recursively and find files with identical content.

    Every file below *path* is hashed with SHA-256. The first file seen with
    a given content is treated as the original; every later file with the
    same hash is reported as a duplicate of it. A running count of visited
    files is written to stdout as progress.

    Directories and files are visited in sorted order, so the choice of
    "original" is the same on every run.

    Returns a dict mapping each duplicate file path to the path of the
    first file found with the same content. Files that cannot be read are
    skipped. A path that does not exist yields an empty dict.
    """
    first_seen = {}   # content digest -> path of the first file with it
    duplicates = {}   # duplicate path -> path of the original
    count = 0
    for root, sub_folders, files in os.walk(path):
        # Sorting in place steers os.walk's top-down descent order.
        sub_folders.sort()
        for name in sorted(files):
            count += 1
            sys.stdout.write("%s \r" % (count))
            sys.stdout.flush()
            filename = os.path.join(root, name)
            try:
                filesum = _sha256_of_file(filename)
            except (IOError, OSError):
                # Best effort: unreadable files (permissions, broken
                # symlinks, files removed mid-scan) are skipped, but
                # programming errors and Ctrl-C are no longer swallowed.
                continue
            if filesum in first_seen:
                duplicates[filename] = first_seen[filesum]
            else:
                first_seen[filesum] = filename
    return duplicates
if __name__ == '__main__':
    # Script entry point: scan the directory given as the first command-line
    # argument and print one line per duplicate file found.
    if len(sys.argv) < 2:
        # Show a usage hint instead of an IndexError traceback.
        sys.stderr.write("Usage: %s <windows or unix path>\n" % sys.argv[0])
        sys.exit(2)
    dupes = traverse(sys.argv[1])
    # print(...) with a single pre-formatted string and dict.items() behave
    # the same on Python 2 and Python 3.
    for k, v in dupes.items():
        print("%s (duplicates) %s" % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment