Skip to content

Instantly share code, notes, and snippets.

@yamatt
Created June 30, 2015 10:56
Show Gist options
  • Save yamatt/860217bc117945d8af22 to your computer and use it in GitHub Desktop.
Save yamatt/860217bc117945d8af22 to your computer and use it in GitHub Desktop.
Use this code to find duplicate files in a directory (GPLv3+)
#!/usr/bin/env python
import os
import hashlib
import collections
import argparse
class File(object):
    """A file on disk whose size and MD5 digest are computed lazily and cached.

    Two instances compare equal when both their sizes and their MD5 digests
    match. The size is checked first, so the (expensive) digest is only
    computed for files whose sizes already agree.
    """

    # Number of bytes read per iteration while hashing, so that large files
    # are never held in memory in full.
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, filepath):
        """Remember *filepath*; nothing is read from disk until it is needed."""
        self.filepath = filepath
        self._filesize = None
        self._md5 = None

    @property
    def filesize(self):
        """Size of the file in bytes (cached after the first access)."""
        if self._filesize is None:
            self._filesize = os.path.getsize(self.filepath)
        return self._filesize

    @property
    def content(self):
        """Full binary content of the file; read from disk on every access."""
        # The context manager closes the handle promptly instead of leaving
        # that to the garbage collector.
        with open(self.filepath, 'rb') as handle:
            return handle.read()

    @property
    def md5(self):
        """Hex MD5 digest of the file content (cached after the first access)."""
        if self._md5 is None:
            digest = hashlib.md5()
            with open(self.filepath, 'rb') as handle:
                # Feed the hash in fixed-size chunks until read() returns b''.
                for chunk in iter(lambda: handle.read(self._CHUNK_SIZE), b''):
                    digest.update(chunk)
            self._md5 = digest.hexdigest()
        return self._md5

    def __eq__(self, other):
        """Return True when *other* has the same size and the same digest."""
        if not isinstance(other, File):
            return NotImplemented
        # Short-circuit on size so differing files are never hashed.
        return self.filesize == other.filesize and self.md5 == other.md5

    def __ne__(self, other):
        """Inverse of ``__eq__`` (Python 2 does not derive it automatically)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash on the size only: equal files always have equal sizes."""
        return hash(self.filesize)

    def __cmp__(self, other):
        """Legacy Python 2 comparison hook: 0 when equal, otherwise 1.

        Python 3 never calls this; it is kept so code that invokes it
        directly keeps working.
        """
        return 0 if self == other else 1

    def __str__(self):
        """Return the file path."""
        return self.filepath
class DeDupe(object):
    """Find files with identical content below a directory."""

    def __init__(self, path):
        """Store *path*, the directory that ``walk`` will scan."""
        self.path = path

    def walk(self):
        """Return a ``File`` for every regular file found below ``self.path``.

        Entries that are not regular files (for example broken symlinks or
        FIFOs) are skipped: asking for their size raises, and opening a FIFO
        for hashing can block.
        """
        file_list = []
        for root, dirs, files in os.walk(self.path):
            for name in files:
                full_path = os.path.join(root, name)
                if os.path.isfile(full_path):
                    file_list.append(File(full_path))
        return file_list

    @staticmethod
    def find_duplicates(file_list):
        """Group the entries of *file_list* that share size and MD5 digest.

        Only the ``filesize`` and ``md5`` attributes of each entry are used.
        Entries are bucketed by size first, so ``md5`` is only read for
        entries whose size is shared with at least one other entry.

        Returns a dict mapping each MD5 hex digest that occurs more than once
        to the list of matching entries, in the order they appear in
        *file_list*. Every entry is listed exactly once.
        """
        by_size = collections.defaultdict(list)
        for candidate in file_list:
            by_size[candidate.filesize].append(candidate)

        duplicates = {}
        for same_size in by_size.values():
            if len(same_size) < 2:
                # A unique size cannot have a duplicate; skip the hashing.
                continue
            by_digest = collections.defaultdict(list)
            for candidate in same_size:
                by_digest[candidate.md5].append(candidate)
            for digest, group in by_digest.items():
                if len(group) > 1:
                    duplicates[digest] = group
        return duplicates
def print_list(l):
    """Print the path of every file in each duplicate group.

    *l* is a mapping whose values are iterables of objects with a
    ``filepath`` attribute. A ``---`` line is printed first and again after
    each group, so groups are visually separated.
    """
    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function.
    print("---")
    for group in l.values():
        for entry in group:
            print(entry.filepath)
        print("---")
def get_args(argv=None):
    """Parse the command-line arguments.

    *argv* is the list of argument strings to parse; when it is ``None``
    argparse falls back to ``sys.argv[1:]``.

    Returns the parsed namespace. Its ``dir`` attribute is the directory
    given on the command line, or the current working directory when the
    argument is omitted.
    """
    parser = argparse.ArgumentParser(description='Find duplicate files.')
    parser.add_argument(
        'dir',
        nargs='?',
        default=os.getcwd(),
        help='directory to scan (default: current working directory)',
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Resolve the directory named on the command line to an absolute path.
    cli_args = get_args()
    scan_root = os.path.abspath(cli_args.dir)
    scanner = DeDupe(scan_root)

    # Collect the files below the root and report how many there are.
    found_files = scanner.walk()
    file_total = len(found_files)
    print("Found {n} files.".format(n=file_total))

    # The number reported here is the number of entries in the mapping
    # returned by find_duplicates.
    matches = scanner.find_duplicates(found_files)
    match_total = len(matches)
    print("Found {n} duplicates.".format(n=match_total))
    print_list(matches)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment