Created
February 9, 2015 03:27
-
-
Save tazdij/d48d2430e3eccf09345a to your computer and use it in GitHub Desktop.
A simple Python script to detect and remove duplicate files in one or more locations, separated by a semicolon.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Written by Don Duvall
# Licensed under MIT
# Date 2015-02-08
# Website: http://deduvall.com/ & http://donaldduvall.com
# Email: [email protected]
#
import hashlib
import os
class DupScout:
    """Scan directory trees for files with identical content and prune extras.

    Files are grouped by the MD5 digest of their content.  For every group
    that holds more than one path the user is asked which copy to keep; the
    other copies are deleted and each decision is written to ``log.txt`` in
    the current working directory.
    """

    def __init__(self):
        # These indexes used to be class attributes, i.e. a single set of
        # dicts shared by every DupScout instance.  Each scanner now owns
        # its own state.
        # Index by MD5 of File -> list of filepath
        self.fileTree = dict()
        # Index by Directory -> list of MD5 (declared, but nothing fills it)
        self.dirIdx = dict()
        # Index by real (symlink-resolved) Filepath -> MD5 of File
        self.fileMD5 = dict()

    def generateHash(self, filename, blocksize=2**20):
        """Return the hexadecimal MD5 digest of the file ``filename``.

        The file is read in chunks of ``blocksize`` bytes so large files are
        never loaded into memory in one piece.  Errors raised by ``open`` or
        ``read`` propagate to the caller.
        """
        m = hashlib.md5()
        with open(filename, "rb") as f:
            while True:
                buf = f.read(blocksize)
                if not buf:
                    break
                m.update(buf)
        return m.hexdigest()

    def _readLine(self, prompt):
        """Show ``prompt`` and return the line typed by the user."""
        try:
            reader = raw_input  # Python 2: returns the line as typed
        except NameError:
            reader = input  # Python 3: raw_input was renamed to input
        return reader(prompt)

    def _chooseKeeper(self, locations):
        """List ``locations`` and return the index of the one to keep.

        Keeps asking until the answer is one of the printed numbers, so a
        typo can no longer abort the run half-way through.
        """
        choices = dict((str(idx), idx) for idx in range(len(locations)))
        for idx, location in enumerate(locations):
            print(str(idx) + '. ' + location)
        while True:
            answer = self._readLine("What location should be kept? ").strip()
            if answer in choices:
                return choices[answer]
            print('Invalid selection: ' + answer)

    def _scan(self, paths):
        """Walk every entry of ``paths`` and index each readable file."""
        for filepath in paths:
            # Loop files and insert into fileTree
            for root, _dirs, files in os.walk(filepath, topdown=False):
                print('Scanning: ' + root)
                count = len(files)
                print('# of files: ' + str(count))
                for i, name in enumerate(files, 1):
                    filename = os.path.join(root, name)
                    # A file reached twice (same path listed twice,
                    # overlapping paths, a symlink next to its target) must
                    # not be reported as a duplicate of itself: deleting the
                    # "other copy" would delete the only copy.
                    real = os.path.realpath(filename)
                    if real not in self.fileMD5:
                        try:
                            fhash = self.generateHash(filename)
                        except (IOError, OSError) as err:
                            # One unreadable file should not end the scan.
                            print('Skipping unreadable file: ' + filename +
                                  ' (' + str(err) + ')')
                        else:
                            self.fileMD5[real] = fhash
                            self.fileTree.setdefault(fhash, []).append(filename)
                    if not (i % 20):
                        print('# ' + str(i) + ' of ' + str(count))

    def main(self, paths):
        """Scan ``paths`` for duplicate files and interactively delete them.

        ``paths`` is an iterable of directory paths.  After all of them have
        been scanned, every group of identical files is written to
        ``log.txt``, the user picks the copy to keep, and the remaining
        copies are removed with ``os.unlink``.  Errors raised by
        ``os.unlink`` propagate to the caller.
        """
        self._scan(paths)

        # Write the tree to a file; the with-block closes the log even when
        # a deletion or the prompt raises.
        with open('log.txt', 'w+') as log:
            # Iterate over a snapshot because entries are rewritten below.
            for fhash, locations in list(self.fileTree.items()):
                if len(locations) < 2:
                    continue
                log.write(fhash + '\t' + str(locations) + '\n')
                print('Found ' + str(len(locations)) + ' locations. Please select a location # to keep.')

                # Prompt user to select the file to keep
                keep = self._chooseKeeper(locations)
                log.write('Keeping Location: ' + locations[keep] + '\n')

                # Delete all other locations
                for idx, location in enumerate(locations):
                    if idx == keep:
                        continue
                    real = os.path.realpath(location)
                    os.unlink(location)
                    self.fileMD5.pop(real, None)
                    log.write('Deleting Location: ' + location + '\n')

                # Drop the deleted paths so a later call on the same
                # instance does not see stale entries.
                self.fileTree[fhash] = [locations[keep]]

        print('Finished, log.txt contains the results.')
if __name__ == '__main__':
    # Script entry point: ask for the locations to scan, then hand them to
    # DupScout.main, which does the scanning, prompting and deleting.
    try:
        read_line = raw_input  # Python 2: returns the line as typed
    except NameError:
        read_line = input  # Python 3: raw_input was renamed to input

    dupscout = DupScout()
    answer = read_line('Enter the path to scan for duplicates: ')
    # Several locations may be given, separated by semicolons.  Ignore
    # padding around each entry and drop empty entries (e.g. a trailing ';').
    paths = [entry.strip() for entry in answer.split(';') if entry.strip()]
    dupscout.main(paths)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment