Created
June 30, 2015 10:56
-
-
Save yamatt/860217bc117945d8af22 to your computer and use it in GitHub Desktop.
Use this code to find duplicate files in a directory (GPLv3+)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import hashlib | |
import collections | |
import argparse | |
class File(object):
    """A file on disk that can be compared with other files by content.

    The size and the MD5 digest are computed lazily on first access and then
    cached on the instance. Two ``File`` objects compare equal when both
    their sizes and their MD5 digests match.
    """

    # Bytes read per iteration while hashing, so large files are never
    # loaded into memory in one piece.
    _CHUNK_SIZE = 1 << 20

    def __init__(self, filepath):
        """Remember *filepath*; nothing is read from disk yet."""
        self.filepath = filepath
        self._filesize = None
        self._md5 = None

    @property
    def filesize(self):
        """Size of the file in bytes (``os.path.getsize``), cached."""
        if self._filesize is None:
            self._filesize = os.path.getsize(self.filepath)
        return self._filesize

    @property
    def content(self):
        """Full content of the file as bytes; re-read on every access."""
        # The context manager closes the handle deterministically instead of
        # leaving that to the garbage collector.
        with open(self.filepath, 'rb') as handle:
            return handle.read()

    @property
    def md5(self):
        """Hexadecimal MD5 digest of the file content, cached."""
        if self._md5 is None:
            digest = hashlib.md5()
            with open(self.filepath, 'rb') as handle:
                for chunk in iter(lambda: handle.read(self._CHUNK_SIZE), b''):
                    digest.update(chunk)
            self._md5 = digest.hexdigest()
        return self._md5

    def _same_content(self, other):
        """Return True when *other* has the same size and MD5 digest."""
        # Size is compared first: it is cheap, and a mismatch short-circuits
        # so that neither file has to be hashed.
        return other.filesize == self.filesize and other.md5 == self.md5

    def __eq__(self, other):
        """Content equality; Python 3 never calls ``__cmp__``."""
        if not isinstance(other, File):
            return NotImplemented
        return self._same_content(other)

    def __ne__(self, other):
        """Inverse of ``__eq__`` (Python 2 does not derive it)."""
        if not isinstance(other, File):
            return NotImplemented
        return not self._same_content(other)

    def __hash__(self):
        """Hash on the size only.

        Equal files always have equal sizes, so this is consistent with
        ``__eq__`` without forcing a digest computation. Accessing the size
        calls ``os.path.getsize`` on first use.
        """
        return hash(self.filesize)

    def __cmp__(self, other):
        """Python 2 three-way comparison: 0 when equal, otherwise 1."""
        # Deliberately does not use ``==`` so Python 2 cannot recurse back
        # into ``__cmp__`` through the rich-comparison fallback.
        return 0 if self._same_content(other) else 1

    def __str__(self):
        """Return the path this object was created with."""
        return self.filepath
class DeDupe(object):
    """Find files with identical content beneath a directory tree."""

    def __init__(self, path):
        """Store *path*, the root directory that ``walk`` will traverse."""
        self.path = path

    def walk(self):
        """Return a list with one ``File`` per file found under ``self.path``.

        Uses ``os.walk``, so every sub-directory is visited.
        """
        return [
            File(os.path.join(root, name))
            for root, _dirs, names in os.walk(self.path)
            for name in names
        ]

    @staticmethod
    def find_duplicates(file_list):
        """Group the entries of *file_list* that share size and MD5 digest.

        Each entry must expose ``filesize`` and ``md5`` attributes.

        Returns a dict mapping an MD5 hex digest to the list of entries with
        that digest. Only digests shared by two or more same-sized entries
        are present, and every entry is listed exactly once, in the order it
        appears in *file_list*.
        """
        # First pass: bucket by size. It is cheap, and an entry whose size is
        # unique cannot have a duplicate, so it never needs to be hashed.
        by_size = {}
        for candidate in file_list:
            by_size.setdefault(candidate.filesize, []).append(candidate)

        duplicates = {}
        for same_size in by_size.values():
            if len(same_size) < 2:
                continue
            # Second pass: within one size bucket, bucket by digest.
            by_digest = {}
            for candidate in same_size:
                by_digest.setdefault(candidate.md5, []).append(candidate)
            for digest, group in by_digest.items():
                if len(group) > 1:
                    duplicates.setdefault(digest, []).extend(group)
        return duplicates
def print_list(l):
    """Print the path of every file in a duplicates mapping.

    *l* is a dict whose values are lists of objects with a ``filepath``
    attribute. A ``---`` line is printed first and again after each group.

    NOTE(review): the original indentation was lost, so the closing
    separator is assumed to follow every group rather than only the last
    one -- confirm against the upstream file.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3; the former print statements were Python 2 only.
    print("---")
    for values in l.values():
        for entry in values:
            print(entry.filepath)
        print("---")
def get_args(argv=None):
    """Parse the command line and return the resulting namespace.

    *argv* is an optional list of argument strings; when it is ``None``
    (the default) argparse reads ``sys.argv[1:]``.

    The namespace has one attribute, ``dir``: the directory to scan, which
    defaults to the current working directory.
    """
    parser = argparse.ArgumentParser(description='Find duplicate files.')
    parser.add_argument(
        'dir',
        nargs='?',
        default=os.getcwd(),
        help='directory to scan (default: current working directory)',
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Resolve the requested directory (current directory when none is
    # given) to an absolute path before scanning.
    args = get_args()
    source_path = os.path.abspath(args.dir)
    dedupe = DeDupe(source_path)

    # Step 1: collect every file below the directory and report the count.
    files = dedupe.walk()
    file_summary = "Found {n} files.".format(n=len(files))
    print(file_summary)

    # Step 2: group identical files. The count reported is the number of
    # keys in the returned mapping.
    duplicates = dedupe.find_duplicates(files)
    duplicate_summary = "Found {n} duplicates.".format(n=len(duplicates))
    print(duplicate_summary)

    # Step 3: list the paths of the duplicated files.
    print_list(duplicates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment