Skip to content

Instantly share code, notes, and snippets.

@Dmitry-Klymenko
Created July 16, 2024 06:05
Show Gist options
  • Save Dmitry-Klymenko/65b69c1727407228274c879191528cec to your computer and use it in GitHub Desktop.
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories: one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate is found, the script can either report the duplicate or delete it, based on the provided command-line arguments.
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# This script is intended to run under Python 2.7
"""
Description:
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories:
one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate
is found, the script can either report the duplicate or delete it, based on the provided command-line arguments.
Key Features:
1. Recursively scans directories to gather all files.
2. Excludes certain directories from the scan (e.g., directories starting with '@' or named '#recycle').
3. Compares files by name, size, and content to determine if they are duplicates.
4. Optionally deletes identified duplicates.
5. Deletes empty directories after processing.
Usage:
To run the script, you must provide two directory paths as arguments:
1. pathToDeleteDuplicates: The path to the directory containing files that may be duplicates and need to be checked.
2. pathToSearchOriginals: The path to the directory containing the original files for comparison.
Additionally, an optional '-w' flag can be provided to delete duplicates instead of just reporting them.
Example:
python script.py /path/to/delete/duplicates /path/to/search/originals -w
Dependencies:
- Python 2.7
- os, sys, subprocess, argparse modules
Note:
- This script sets the default encoding to utf-8 to handle non-ASCII file names.
- Ensure you have the necessary permissions to delete files and directories if using the '-w' flag.
Functions:
1. get_all_files(path, exclude_path=None): Recursively retrieves all files from a directory, excluding specified paths.
2. compare_first_n_bytes(file1, file2, n=156): Compares the first 'n' bytes of two files.
3. compare_files(file1, file2): Uses the 'cmp' command to compare two files.
4. is_duplicate(file1, file2): Checks if two files are duplicates based on name, size, and content.
5. delete_empty_dirs(path): Deletes empty directories within a specified path.
6. delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False): Identifies and optionally deletes duplicates.
"""
import argparse
import errno
import os
import subprocess
import sys
def get_all_files(path, exclude_path=None):
    """Recursively collect every file path under *path*.

    Directories whose name starts with '@' (e.g. Synology '@eaDir') or is
    named '#recycle' are pruned and never descended into.  If *exclude_path*
    is given, that directory and its whole subtree are skipped as well.

    :param path: root directory to scan.
    :param exclude_path: optional directory subtree to leave out of the result.
    :return: list of file paths (joined from *path*, so absolute iff *path* is).
    """
    all_files = []
    # Normalize once up front so the per-directory comparison below works even
    # when the caller passes an unnormalized exclude_path (trailing slash,
    # 'dir/./sub', etc.) — this closes the edge case noted in the old TODO.
    if exclude_path:
        exclude_path = os.path.normpath(exclude_path)
    for root, dirs, files in os.walk(path):
        # Prune special directories in-place so os.walk never enters them.
        dirs[:] = [d for d in dirs if not d.startswith('@') and d != '#recycle']
        normalized_root = os.path.normpath(root)
        # Skip the excluded subtree.  Matching on an explicit os.sep boundary
        # prevents '/a/bc' being mistaken for a child of '/a/b'.
        if exclude_path and (normalized_root == exclude_path or normalized_root.startswith(exclude_path + os.sep)):
            continue
        for file in files:
            all_files.append(os.path.join(root, file))
    return all_files
def compare_first_n_bytes(file1, file2, n=156):
    """Return True if the first *n* bytes of *file1* and *file2* are equal.

    Any IOError (missing file, permission problem) is reported to stdout and
    treated as "not equal".
    """
    try:
        with open(file1, 'rb') as left:
            with open(file2, 'rb') as right:
                head_left = left.read(n)
                head_right = right.read(n)
        return head_left == head_right
    except IOError as e:
        print("Error reading files:", e)
        return False
def compare_files(file1, file2):
    """Byte-compare two files with the POSIX ``cmp`` utility.

    Uses ``cmp -s`` (silent mode) so the child process produces no output at
    all.  The previous version redirected stdout/stderr to subprocess.PIPE
    without ever reading the pipes, which can deadlock ``subprocess.call`` if
    the child writes more than the OS pipe buffer holds.

    :return: True only when the files are identical (cmp exit status 0);
             False when they differ or cmp reports trouble (status 1 or 2).
    """
    result = subprocess.call(['cmp', '-s', file1, file2])
    return result == 0  # cmp exits 0 only for identical files
def is_duplicate(file1, file2):
    """Return True if *file1* and *file2* are duplicates.

    Two files count as duplicates when they share the same base name, the
    same size, and identical content.  The checks are ordered cheapest first
    so the expensive content comparison only runs for plausible matches.
    """
    if os.path.basename(file1) != os.path.basename(file2):
        return False
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False
    # Content comparison is the costly step; return its verdict directly
    # instead of the old '== False' anti-idiom.
    return compare_files(file1, file2)
def delete_empty_dirs(path):
    """Remove every empty directory below *path* (but not *path* itself).

    Walks bottom-up (topdown=False) so a directory that becomes empty after
    its children are removed is itself removed later in the same pass.
    """
    for root, dirs, _ in os.walk(path, topdown=False):
        for name in dirs:  # 'name' rather than 'dir' to avoid shadowing the builtin
            dir_path = os.path.join(root, name)
            # EAFP: just attempt the rmdir.  os.rmdir refuses non-empty
            # directories on its own, which also removes the race window the
            # old os.listdir()-then-rmdir check had.
            try:
                os.rmdir(dir_path)
                print("Deleted empty directory:", dir_path)
            except OSError as e:
                # Non-empty is the expected, silent case; anything else
                # (permissions, etc.) is reported like before.
                if e.errno not in (errno.ENOTEMPTY, errno.EEXIST):
                    print("Error deleting directory:", dir_path, "Error:", e)
def delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False):
    """Report (and with delete=True remove) files under *pathToDeleteDuplicates*
    that also exist under *pathToSearchOriginals*, then clean up empty dirs.

    Originals are indexed by (basename, size) so each candidate is only
    content-compared against plausible matches, instead of the previous
    all-pairs O(n*m) scan that ran ``cmp`` checks across every pair.

    :param pathToDeleteDuplicates: directory whose files may be duplicates.
    :param pathToSearchOriginals: directory holding the reference originals.
    :param delete: when True, actually remove each confirmed duplicate.
    """
    files_to_delete = get_all_files(pathToDeleteDuplicates)
    # When the duplicates directory lives inside the originals directory,
    # exclude it from the originals scan so files never match themselves.
    # Match on a real path boundary so '/a/bc' is not treated as inside '/a/b'.
    if (pathToDeleteDuplicates == pathToSearchOriginals
            or pathToDeleteDuplicates.startswith(pathToSearchOriginals + os.sep)):
        exclude = pathToDeleteDuplicates
    else:
        exclude = None
    files_to_search = get_all_files(pathToSearchOriginals, exclude)
    # Index originals by the cheap duplicate criteria (name + size); only
    # same-key files ever reach the expensive content comparison.
    originals_by_key = {}
    for original in files_to_search:
        key = (os.path.basename(original), os.path.getsize(original))
        originals_by_key.setdefault(key, []).append(original)
    for file_to_delete in files_to_delete:
        key = (os.path.basename(file_to_delete), os.path.getsize(file_to_delete))
        for candidate in originals_by_key.get(key, ()):
            if is_duplicate(file_to_delete, candidate):
                print("Duplicate", file_to_delete, 'Original', candidate)
                if delete:
                    try:
                        os.remove(file_to_delete)
                    except OSError as e:
                        print("Error deleting file:", file_to_delete, "Error:", e)
                break  # one confirmed original is enough for this file
    delete_empty_dirs(pathToDeleteDuplicates)
def _main():
    """Command-line entry point: parse arguments, validate paths, run the scan."""
    # Reload sys and set default encoding to utf-8 for handling non-ASCII file names
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Describe and parse the command line.
    parser = argparse.ArgumentParser(description='Delete duplicate files and remove empty directories.')
    parser.add_argument('pathToDeleteDuplicates', help='Path to the directory containing duplicates to delete.')
    parser.add_argument('pathToSearchOriginals', help='Path to the directory containing original files.')
    parser.add_argument('-w', action='store_true', help='Delete duplicates instead of just reporting them.')
    args = parser.parse_args()

    # Refuse to run unless both arguments are existing directories.
    if not os.path.isdir(args.pathToDeleteDuplicates):
        print("Error: First parameter pathToDeleteDuplicates must be a valid directory:", args.pathToDeleteDuplicates)
        sys.exit(1)
    if not os.path.isdir(args.pathToSearchOriginals):
        print("Error: Second parameter pathToSearchOriginals must be a valid directory:", args.pathToSearchOriginals)
        sys.exit(1)

    print("Going over all files in '{}' and flagging duplicate files that are found in {}".format(args.pathToDeleteDuplicates, args.pathToSearchOriginals))
    if args.w:
        print(" -w flag was specified. Script WILL DELETE duplicate files")

    # Run the duplicate scan with normalized paths and the delete flag.
    delete_duplicates(os.path.normpath(args.pathToDeleteDuplicates), os.path.normpath(args.pathToSearchOriginals), delete=args.w)
    print("All done")


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment