Last active
March 20, 2016 01:08
-
-
Save theonlypwner/1ca26a5de900dcb5d51b to your computer and use it in GitHub Desktop.
Duplicate File Detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Duplicate File Detection
__copyright__ = "Copyright (C) 2016 Victor Zheng"
__licence__ = "GNU GPL v3"
# Based on https://github.com/IanLee1521/utilities/blob/master/utilities/find_duplicates.py
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
import argparse
import hashlib
import os
import sys
def process(dirs):
    """Scan the given directories and print each group of duplicate files.

    Files are first bucketed by size (cheap), then only same-size files
    are hashed and compared, largest sizes first.
    """
    sizesToPaths = {}
    print('Finding files...')
    for directory in dirs:
        processFolderBySize(directory, sizesToPaths)
    print('Detecting duplicates...')
    # Sizes are unique keys, so sorting the items orders purely by size.
    for size, candidates in sorted(sizesToPaths.items(), reverse=True):
        if len(candidates) == 1:
            continue  # a unique size cannot have duplicates
        for digest, matches in processFilesByHash(candidates).items():
            if len(matches) == 1:
                continue  # unique content under this size
            print('{} ({} B) found {} times'.format(digest, size, len(matches)))
            for match in matches:
                print(' {}'.format(fixUnicode(match)))
def processFolderBySize(rootDir, dict_out):
    """Walk rootDir and group every file path by its size in bytes.

    dict_out maps size -> list of paths and is mutated in place, so
    several roots can accumulate into one shared dictionary.
    Prints a warning for invalid roots, empty leaf directories, and
    files that disappear mid-scan; never raises for those cases.
    """
    if not os.path.exists(rootDir):
        print('Invalid path: {}'.format(fixUnicode(rootDir)))
        return
    for dirpath, subdirs, filenames in os.walk(rootDir, topdown=False):
        if not filenames and not subdirs:
            print('Empty: {}'.format(fixUnicode(dirpath)))
        for name in filenames:
            path = os.path.join(dirpath, name)
            # EAFP: a file can vanish between os.walk listing it and the
            # stat call, so catch the error rather than racing an
            # os.path.exists() check (the old LBYL check could still
            # crash if the file disappeared right after it passed).
            try:
                size = os.path.getsize(path)
            except OSError:
                print('File disappeared: {}'.format(fixUnicode(path)))
                continue
            dict_out.setdefault(size, []).append(path)
def processFilesByHash(paths):
    """Group the given file paths by content hash.

    Returns a dict mapping hex digest -> list of paths with that digest.
    """
    grouped = {}
    for path in paths:
        grouped.setdefault(hashPath(path), []).append(path)
    return grouped
def hashPath(path, blockSize=1048576):
    """Return the MD5 hex digest of the file at path.

    The file is read in blockSize-byte chunks so arbitrarily large
    files are hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it
        # returns b'' at end-of-file.
        for chunk in iter(lambda: stream.read(blockSize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def fixUnicode(s):
    """Return s with every non-ASCII character replaced by '?'.

    Keeps printing safe on consoles whose encoding cannot represent
    arbitrary Unicode file names.
    """
    ascii_bytes = s.encode('ascii', 'replace')
    return ascii_bytes.decode('ascii')
def main():
    """Parse the command line and run duplicate detection on each directory."""
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'dirs', metavar='dir', type=str, nargs='+',
        help='Director(y|ies) to check for duplicates',
    )
    options = parser.parse_args()
    process(options.dirs)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment