Created
July 16, 2024 06:05
-
-
Save Dmitry-Klymenko/65b69c1727407228274c879191528cec to your computer and use it in GitHub Desktop.
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories: one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate is found, the script can either report the duplicate or delete it, based on the provided command-line arguments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
# -*- coding: utf-8 -*- | |
# This script is intended to run under Python 2.7 | |
""" | |
Description: | |
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories: | |
one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate | |
is found, the script can either report the duplicate or delete it, based on the provided command-line arguments. | |
Key Features: | |
1. Recursively scans directories to gather all files. | |
2. Excludes certain directories from the scan (e.g., directories starting with '@' or named '#recycle'). | |
3. Compares files by name, size, and content to determine if they are duplicates. | |
4. Optionally deletes identified duplicates. | |
5. Deletes empty directories after processing. | |
Usage: | |
To run the script, you must provide two directory paths as arguments: | |
1. pathToDeleteDuplicates: The path to the directory containing files that may be duplicates and need to be checked. | |
2. pathToSearchOriginals: The path to the directory containing the original files for comparison. | |
Additionally, an optional '-w' flag can be provided to delete duplicates instead of just reporting them. | |
Example: | |
python script.py /path/to/delete/duplicates /path/to/search/originals -w | |
Dependencies: | |
- Python 2.7 | |
- os, sys, subprocess, argparse modules | |
Note: | |
- This script sets the default encoding to utf-8 to handle non-ASCII file names. | |
- Ensure you have the necessary permissions to delete files and directories if using the '-w' flag. | |
Functions: | |
1. get_all_files(path, exclude_path=None): Recursively retrieves all files from a directory, excluding specified paths. | |
2. compare_first_n_bytes(file1, file2, n=156): Compares the first 'n' bytes of two files. | |
3. compare_files(file1, file2): Uses the 'cmp' command to compare two files. | |
4. is_duplicate(file1, file2): Checks if two files are duplicates based on name, size, and content. | |
5. delete_empty_dirs(path): Deletes empty directories within a specified path. | |
6. delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False): Identifies and optionally deletes duplicates. | |
""" | |
import os | |
import sys | |
import subprocess | |
import argparse | |
def get_all_files(path, exclude_path=None):
    """Recursively collect every file path under *path*.

    Directories named '#recycle' or starting with '@' are pruned from the
    walk, and any directory equal to (or nested under) *exclude_path* is
    skipped entirely.
    """
    collected = []
    for root, dirs, files in os.walk(path):
        # Prune special/system directories in place so os.walk never descends.
        dirs[:] = [d for d in dirs if d != '#recycle' and not d.startswith('@')]
        current = os.path.normpath(root)
        # Skip anything at or below the excluded directory.
        # TODO - review and fix edge cases, like setting up invalid combination of path1 and path2
        if exclude_path:
            if current == exclude_path or current.startswith(exclude_path + os.sep):
                continue
        collected.extend(os.path.join(root, name) for name in files)
    return collected
def compare_first_n_bytes(file1, file2, n=156):
    """Return True when the first *n* bytes of both files match.

    On any read error (missing file, no permission) the error is reported
    and False is returned rather than raising.
    """
    try:
        with open(file1, 'rb') as first:
            head_a = first.read(n)
        with open(file2, 'rb') as second:
            head_b = second.read(n)
    except IOError as err:
        print("Error reading files:", err)
        return False
    return head_a == head_b
def compare_files(file1, file2):
    """Byte-compare two files with the external ``cmp`` utility.

    Returns True when the files are identical, False otherwise (including
    when ``cmp`` fails, e.g. a file disappeared mid-run).
    """
    # Fix: subprocess.call() must not be given stdout=subprocess.PIPE — nothing
    # reads the pipe, which can deadlock if the child fills the pipe buffer.
    # Run cmp in silent mode (-s) and send any residual output to devnull.
    with open(os.devnull, 'wb') as devnull:
        result = subprocess.call(['cmp', '-s', file1, file2],
                                 stdout=devnull, stderr=devnull)
    return result == 0  # cmp exits 0 only when files are identical
def is_duplicate(file1, file2):
    """Return True when *file1* and *file2* are duplicates.

    Two files count as duplicates only when they share basename, size and
    exact content. Cheap checks run first; the full byte comparison via the
    external ``cmp`` tool only runs when everything else already matches.
    """
    if os.path.basename(file1) != os.path.basename(file2):
        return False
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False
    # Fast path (was defined but never used): if the leading bytes already
    # differ, the files cannot be identical — skip spawning cmp entirely.
    if not compare_first_n_bytes(file1, file2):
        return False
    # Fix: idiomatic truth test instead of `== False`.
    return compare_files(file1, file2)
def delete_empty_dirs(path):
    """Remove every empty directory below *path*.

    The tree is walked bottom-up so a directory that becomes empty after its
    children are removed is itself deleted on the same pass.
    """
    for parent, subdirs, _ in os.walk(path, topdown=False):
        for name in subdirs:
            candidate = os.path.join(parent, name)
            if os.listdir(candidate):
                continue  # still has contents — leave it alone
            try:
                os.rmdir(candidate)
            except OSError as err:
                print("Error deleting directory:", candidate, "Error:", err)
            else:
                print("Deleted empty directory:", candidate)
def delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False):
    """Report (and, when *delete* is True, remove) files under
    pathToDeleteDuplicates that also exist under pathToSearchOriginals,
    then clean up any directories left empty.

    Both paths are expected to be normalized (os.path.normpath) by the caller.
    """
    files_to_delete = get_all_files(pathToDeleteDuplicates)
    # When the duplicates directory lives inside the originals directory,
    # exclude it from the originals scan so files are never compared with
    # themselves. Fix: a bare startswith() prefix test wrongly matched
    # sibling paths (e.g. '/data/foo-bar' for '/data/foo'); require an exact
    # match or a path-separator boundary instead.
    exclude = None
    if (pathToDeleteDuplicates == pathToSearchOriginals or
            pathToDeleteDuplicates.startswith(pathToSearchOriginals + os.sep)):
        exclude = pathToDeleteDuplicates
    files_to_search = get_all_files(pathToSearchOriginals, exclude)
    for file_to_delete in files_to_delete:
        for original in files_to_search:
            if is_duplicate(file_to_delete, original):
                print("Duplicate", file_to_delete, 'Original', original)
                if delete:
                    try:
                        os.remove(file_to_delete)
                    except OSError as e:
                        print("Error deleting file:", file_to_delete, "Error:", e)
                break  # one matching original is enough
    delete_empty_dirs(pathToDeleteDuplicates)
if __name__ == "__main__": | |
# Reload sys and set default encoding to utf-8 for handling non-ASCII file names | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
# Setup argument parser with a description | |
parser = argparse.ArgumentParser(description='Delete duplicate files and remove empty directories.') | |
parser.add_argument('pathToDeleteDuplicates', help='Path to the directory containing duplicates to delete.') | |
parser.add_argument('pathToSearchOriginals', help='Path to the directory containing original files.') | |
parser.add_argument('-w', action='store_true', help='Delete duplicates instead of just reporting them.') | |
# Parse arguments | |
args = parser.parse_args() | |
pathToDeleteDuplicates = args.pathToDeleteDuplicates | |
pathToSearchOriginals = args.pathToSearchOriginals | |
# Check if the pathToDeleteDuplicates is a valid directory | |
if not os.path.isdir(pathToDeleteDuplicates): | |
print("Error: First parameter pathToDeleteDuplicates must be a valid directory:", pathToDeleteDuplicates) | |
sys.exit(1) | |
# Check if the pathToSearchOriginals is a valid directory | |
if not os.path.isdir(pathToSearchOriginals): | |
print("Error: Second parameter pathToSearchOriginals must be a valid directory:", pathToSearchOriginals) | |
sys.exit(1) | |
print("Going over all files in '{}' and flagging duplicate files that are found in {}".format(pathToDeleteDuplicates, pathToSearchOriginals)) | |
if args.w: | |
print(" -w flag was specified. Script WILL DELETE duplicate files") | |
# Call the delete_duplicates function with the provided paths and delete flag | |
delete_duplicates(os.path.normpath(pathToDeleteDuplicates), os.path.normpath(pathToSearchOriginals), delete=args.w) | |
# Print completion message | |
print("All done") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment