Skip to content

Instantly share code, notes, and snippets.

@tazdij
Created February 9, 2015 03:27
Show Gist options
  • Save tazdij/d48d2430e3eccf09345a to your computer and use it in GitHub Desktop.
Save tazdij/d48d2430e3eccf09345a to your computer and use it in GitHub Desktop.
A simple Python script to detect and remove duplicate files in one or more locations, separated by semicolons.
# Written by Don Duvall
# Licensed under MIT
# Date 2015-02-08
# Website: http://deduvall.com/ & http://donaldduvall.com
# Email: [email protected]
#
import os
import hashlib
class DupScout:
    """Find files with identical content and interactively delete the extras.

    Files are grouped by the MD5 digest of their content. For every group
    holding more than one file the user is asked which copy to keep; all
    other copies in that group are deleted. Every decision is written to
    ``log.txt`` in the current working directory.

    NOTE: two files are treated as duplicates purely because their MD5
    digests match; no byte-for-byte comparison is made before deleting.
    """

    def __init__(self):
        # The indexes live on the instance so that separate DupScout objects
        # do not share (and corrupt) each other's scan results.
        # Index by MD5 of File -> list of filepath
        self.fileTree = dict()
        # Index by Directory -> list of MD5 (declared for callers; this class
        # does not populate it)
        self.dirIdx = dict()
        # Index by symlink-resolved Filepath -> MD5 of File
        self.fileMD5 = dict()

    def generateHash(self, filename, blocksize=2**20):
        """Return the hexadecimal MD5 digest of the file at *filename*.

        The file is read in chunks of *blocksize* bytes so that large files
        are never loaded into memory in one piece. Errors raised by ``open``
        or ``read`` propagate to the caller.
        """
        m = hashlib.md5()
        with open(filename, "rb") as f:
            # iter() with a sentinel stops at the first empty read (EOF).
            for buf in iter(lambda: f.read(blocksize), b""):
                m.update(buf)
        return m.hexdigest()

    def main(self, paths):
        """Scan *paths*, then resolve every group of duplicates interactively.

        *paths* is an iterable of directory paths. Results and decisions are
        written to ``log.txt``, which is closed even if the run is aborted.
        """
        self._scan(paths)
        with open('log.txt', 'w+') as log:
            # Snapshot the items: _resolve rewrites entries of fileTree.
            for fhash, locations in list(self.fileTree.items()):
                if len(locations) > 1:
                    self._resolve(log, fhash, locations)
        print('Finished, log.txt contains the results.')

    def _scan(self, paths):
        """Walk every directory in *paths* and index each file found."""
        for filepath in paths:
            for root, _dirs, files in os.walk(filepath, topdown=False):
                print('Scanning: ' + root)
                count = len(files)
                print('# of files: ' + str(count))
                for i, name in enumerate(files, 1):
                    self._index(os.path.join(root, name))
                    # Progress report every 20 files.
                    if not (i % 20):
                        print('# ' + str(i) + ' of ' + str(count))

    def _index(self, filename):
        """Hash *filename* and record it, unless it was already recorded."""
        # The same physical file can be reached more than once (overlapping
        # or repeated scan paths, symlinks). Recording it twice would make it
        # look like its own duplicate, and "delete the others" would then
        # remove the very file the user asked to keep.
        real = os.path.realpath(filename)
        if real in self.fileMD5:
            return
        try:
            fhash = self.generateHash(filename)
        except (IOError, OSError) as err:
            # One unreadable file must not abort the whole scan.
            print('Skipping unreadable file: ' + filename + ' (' + str(err) + ')')
            return
        self.fileMD5[real] = fhash
        self.fileTree.setdefault(fhash, []).append(filename)

    def _resolve(self, log, fhash, locations):
        """Ask which of *locations* to keep, delete the rest, log to *log*."""
        log.write(fhash + '\t' + str(locations) + '\n')
        print('Found ' + str(len(locations)) + ' locations. Please select a location # to keep.')
        for i, location in enumerate(locations):
            print(str(i) + '. ' + location)

        keep = self._ask_index(len(locations))
        log.write('Keeping Location: ' + locations[keep] + '\n')

        survivors = [locations[keep]]
        for i, location in enumerate(locations):
            if i == keep:
                continue
            # Resolve before unlinking; the path is gone afterwards.
            real = os.path.realpath(location)
            try:
                os.unlink(location)
            except OSError as err:
                log.write('Failed to delete Location: ' + location + ' (' + str(err) + ')\n')
                survivors.append(location)
                continue
            self.fileMD5.pop(real, None)
            log.write('Deleting Location: ' + location + '\n')
        # Keep the index in step with what is actually left on disk.
        self.fileTree[fhash] = survivors

    def _ask_index(self, count):
        """Prompt until the user enters an integer in ``range(count)``."""
        # raw_input exists only on Python 2; Python 3 renamed it to input.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        while True:
            answer = read_line("What location should be kept? ").strip()
            if answer.isdigit() and int(answer) < count:
                return int(answer)
            # Nothing has been deleted yet, so asking again is always safe.
            print('Please enter a number between 0 and ' + str(count - 1) + '.')
# Script entry point: ask for one or more directories (separated by
# semicolons) and run the duplicate scan over them.
if __name__ == '__main__':
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    dupscout = DupScout()
    paths = read_line('Enter the path to scan for duplicates: ')
    # Strip surrounding spaces and drop blank entries: os.walk yields nothing
    # for a path that does not exist, so "a; b" would silently skip " b".
    paths = [p.strip() for p in paths.split(';') if p.strip()]
    dupscout.main(paths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment