Skip to content

Instantly share code, notes, and snippets.

@arsane
Created August 25, 2014 02:36
Show Gist options
  • Save arsane/5aeb2d330abae13e022a to your computer and use it in GitHub Desktop.
Save arsane/5aeb2d330abae13e022a to your computer and use it in GitHub Desktop.
Remove files with the same content under the same folder.
import os, sys
import hashlib
def file_digest(f):
    """Return the MD5 hex digest of the file at path *f*.

    The original used the Python 2-only ``file()`` builtin, read in text
    mode (wrong digests for binary data on Windows due to newline
    translation), never closed the handle, and read the whole file into
    memory at once.  This version opens in binary mode under a context
    manager and hashes in fixed-size chunks so large files are safe.
    """
    m = hashlib.md5()
    with open(f, "rb") as fh:
        # iter(callable, sentinel): read 64 KiB at a time until EOF (b"").
        for chunk in iter(lambda: fh.read(65536), b""):
            m.update(chunk)
    return m.hexdigest()
def remove_duplicates(dir):
    """Delete files in *dir* whose content duplicates an earlier file there.

    Files are bucketed by size first; only same-size files are hashed
    (MD5) and compared, so most files are never read.  The first file
    seen with a given content survives; later identical files are
    removed with ``os.remove()``.  Empty files and non-regular files are
    skipped.

    NOTE: chdir's into *dir* and does not restore the previous working
    directory (kept for backward compatibility with the original script).
    """

    def _digest(path):
        # Binary-mode, chunked MD5 so large files don't exhaust memory.
        h = hashlib.md5()
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()

    sizes = {}   # size -> list of *kept* filenames with that size
    hashs = {}   # filename -> cached md5 hexdigest
    os.chdir(dir)
    for filename in os.listdir("."):
        if not os.path.isfile(filename):
            continue
        size = os.stat(filename).st_size
        if size == 0:
            # Empty files are ignored entirely (original behavior).
            continue
        if size not in sizes:
            sizes[size] = [filename]
            continue
        # At least one earlier file has the same size: compare by hash.
        hash_val = _digest(filename)
        removed = False
        for other in sizes[size]:
            if other not in hashs:
                hashs[other] = _digest(other)
            if hashs[other] == hash_val:
                os.remove(filename)
                print("remove file: %s which same as file: %s, size: %d, hash: %s"
                      % (filename, other, size, hash_val))
                removed = True
                break
        if not removed:
            # BUG FIX: the original appended only *removed* filenames here
            # (``if toremove == 1``), so deleted names were re-hashed later
            # (crashing on a missing file) and kept unique files were never
            # compared against later arrivals.  Keep the survivor instead,
            # caching its hash for future comparisons.
            sizes[size].append(filename)
            hashs[filename] = hash_val
if __name__ == '__main__':
    # Require exactly one argument (the target directory) instead of
    # crashing with an IndexError when it is missing.
    if len(sys.argv) != 2:
        sys.exit("usage: %s <directory>" % sys.argv[0])
    remove_duplicates(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment