Find duplicate files within a file system sub-tree. It demonstrates a simple way of combining Functional and Object-Oriented Programming techniques -- no religious wars! -- using generator expressions, comprehensions, and decorators, among other features.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import fnmatch
import os
import sys
from functools import wraps
from itertools import imap, chain
from os import path

# Set the base path to scan for duplicates here
base = r'/media/file-rep/files'


def must_exist(f):
    '''Decorator: fail fast if a method is called on a missing directory'''
    @wraps(f)
    def check(self, *args, **kwds):
        if not self.exists:
            raise ValueError('Path to directory %r does not exist.' % self.path)
        return f(self, *args, **kwds)
    return check


class Dir(object):
    '''Encapsulates directory and sub-tree scanning logic'''

    def __init__(self, dirpath):
        self.path = path.abspath(dirpath)

    @property
    def exists(self):
        return path.exists(self.path)

    def _get_full_path_of(self, f):
        return path.normcase(path.join(self.path, f))

    def _get_entries(self, of_type, pattern):
        entries = (f for f in os.listdir(self.path)
                   if of_type(self._get_full_path_of(f)))
        if pattern:
            entries = (f for f in entries if fnmatch.fnmatch(f, pattern))
        return entries

    @must_exist
    def get_sub_dirs(self, pattern=None):
        entries = self._get_entries(path.isdir, pattern)
        return (Dir(self._get_full_path_of(f)) for f in entries)

    @must_exist
    def get_files(self, pattern=None):
        entries = self._get_entries(path.isfile, pattern)
        return imap(self._get_full_path_of, entries)

    def __str__(self):
        return self.path

#~ end class Dir


def find_depth(top, file_spec):
    '''Lazily yield every file matching file_spec in the sub-tree rooted at top'''
    directory = Dir(top)
    local = directory.get_files(file_spec)
    recursive = (found for subdir in directory.get_sub_dirs()
                 for found in find_depth(subdir.path, file_spec))
    return chain(local, recursive)


def find_duplicates_of(file_path):
    file_name = path.basename(file_path)
    return [r for r in sorted(find_depth(base, file_name)) if r != file_path]


def print_duplicates_of(file_name, duplicates):
    file_list = '\n\t'.join([file_name] + duplicates)
    sys.stderr.write('\n%s\n' % file_list)


if __name__ == '__main__':
    for cur_file in Dir('.').get_files('_*.*'):
        sys.stdout.write('.')  # progress indicator on stdout; report goes to stderr
        duplicates = find_duplicates_of(cur_file)
        if duplicates:
            print_duplicates_of(cur_file, duplicates)
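The Dir class and the find_depth generator also compose on their own, outside the duplicate-finding script. A hypothetical session, assuming the gist is saved as dupes.py and a /home/user/music tree exists:

from dupes import Dir, find_depth

# list the immediate sub-directories of a collection
for d in Dir('/home/user/music').get_sub_dirs():
    print d

# lazily walk the whole tree for mp3 files
for f in find_depth('/home/user/music', '*.mp3'):
    print f

Because everything is a generator, nothing is read from disk until the loops actually consume the values.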
I created this little script to aid the maintenance of a media file collection. Files were organized in directories by category, and each directory had an MD5SUMS digest file, so checking for true duplicates vs. mere name clashes was trivial. However, I decided to leave the digest-related code out of the gist to keep things simple and focused.
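For reference, a minimal sketch of what that omitted check might look like -- it is not part of the gist, and it assumes each directory's MD5SUMS file uses the standard md5sum output format (hex digest, two spaces, file name, one entry per line):

from os import path

def read_md5sums(dir_path):
    '''Parse a directory's MD5SUMS file into a {file name: digest} dict.
    Assumes standard md5sum output: "<digest>  <name>" per line.'''
    digests = {}
    with open(path.join(dir_path, 'MD5SUMS')) as f:
        for line in f:
            digest, _, name = line.rstrip('\n').partition('  ')
            if name:
                digests[name] = digest
    return digests

def is_true_duplicate(file_a, file_b):
    '''True if both files carry the same digest in their directories' MD5SUMS'''
    digest_a = read_md5sums(path.dirname(file_a)).get(path.basename(file_a))
    digest_b = read_md5sums(path.dirname(file_b)).get(path.basename(file_b))
    return digest_a is not None and digest_a == digest_b

With something like this in place, find_duplicates_of could keep only the results for which is_true_duplicate holds, separating real duplicates from name clashes.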