Last active
March 20, 2016 01:08
-
-
Save theonlypwner/1ca26a5de900dcb5d51b to your computer and use it in GitHub Desktop.
Duplicate File Detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Duplicate File Detection
__copyright__ = "Copyright (C) 2016 Victor Zheng"
__licence__ = "GNU GPL v3"
# Based on https://github.com/IanLee1521/utilities/blob/master/utilities/find_duplicates.py
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
import argparse
import hashlib
import os
import sys
def process(dirs):
    """Scan the given directories and print each group of duplicate files.

    Files are first bucketed by size (cheap), then only same-size files
    are hashed and compared, largest sizes first.
    """
    sizesToPaths = {}
    print('Finding files...')
    for directory in dirs:
        processFolderBySize(directory, sizesToPaths)
    print('Detecting duplicates...')
    # Sizes are unique keys, so sorting the items orders purely by size.
    for size, candidates in sorted(sizesToPaths.items(), reverse=True):
        if len(candidates) == 1:
            continue  # a unique size cannot have duplicates
        for digest, matches in processFilesByHash(candidates).items():
            if len(matches) == 1:
                continue  # unique content under this size
            print('{} ({} B) found {} times'.format(digest, size, len(matches)))
            for match in matches:
                print(' {}'.format(fixUnicode(match)))
def processFolderBySize(rootDir, dict_out):
    """Walk rootDir and group every file path by its size in bytes.

    dict_out maps size -> list of paths and is mutated in place, so
    several roots can accumulate into one shared dictionary.
    Prints a warning for invalid roots, empty leaf directories, and
    files that disappear mid-scan; never raises for those cases.
    """
    if not os.path.exists(rootDir):
        print('Invalid path: {}'.format(fixUnicode(rootDir)))
        return
    for dirpath, subdirs, filenames in os.walk(rootDir, topdown=False):
        if not filenames and not subdirs:
            print('Empty: {}'.format(fixUnicode(dirpath)))
        for name in filenames:
            path = os.path.join(dirpath, name)
            # EAFP: a file can vanish between os.walk listing it and the
            # stat call, so catch the error rather than racing an
            # os.path.exists() check (the old LBYL check could still
            # crash if the file disappeared right after it passed).
            try:
                size = os.path.getsize(path)
            except OSError:
                print('File disappeared: {}'.format(fixUnicode(path)))
                continue
            dict_out.setdefault(size, []).append(path)
def processFilesByHash(paths):
    """Group the given file paths by content hash.

    Returns a dict mapping hex digest -> list of paths with that digest.
    """
    grouped = {}
    for path in paths:
        grouped.setdefault(hashPath(path), []).append(path)
    return grouped
def hashPath(path, blockSize=1048576):
    """Return the MD5 hex digest of the file at path.

    The file is read in blockSize-byte chunks so arbitrarily large
    files are hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it
        # returns b'' at end-of-file.
        for chunk in iter(lambda: stream.read(blockSize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def fixUnicode(s):
    """Return s with every non-ASCII character replaced by '?'.

    Keeps printing safe on consoles whose encoding cannot represent
    arbitrary Unicode file names.
    """
    ascii_bytes = s.encode('ascii', 'replace')
    return ascii_bytes.decode('ascii')
def main():
    """Parse the command line and run duplicate detection on each directory."""
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'dirs', metavar='dir', type=str, nargs='+',
        help='Director(y|ies) to check for duplicates',
    )
    options = parser.parse_args()
    process(options.dirs)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment