Skip to content

Instantly share code, notes, and snippets.

@Dmitry-Klymenko
Created July 16, 2024 06:05
Show Gist options
  • Save Dmitry-Klymenko/65b69c1727407228274c879191528cec to your computer and use it in GitHub Desktop.
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories: one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate is found, the script can either report the duplicate or delete it, based on the provided command-line arguments.
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# This script is intended to run under Python 2.7
"""
Description:
This script is designed to identify and delete duplicate files within a specified directory. It compares files in two directories:
one containing the files to check for duplicates and the other containing the original files to compare against. If a duplicate
is found, the script can either report the duplicate or delete it, based on the provided command-line arguments.
Key Features:
1. Recursively scans directories to gather all files.
2. Excludes certain directories from the scan (e.g., directories starting with '@' or named '#recycle').
3. Compares files by name, size, and content to determine if they are duplicates.
4. Optionally deletes identified duplicates.
5. Deletes empty directories after processing.
Usage:
To run the script, you must provide two directory paths as arguments:
1. pathToDeleteDuplicates: The path to the directory containing files that may be duplicates and need to be checked.
2. pathToSearchOriginals: The path to the directory containing the original files for comparison.
Additionally, an optional '-w' flag can be provided to delete duplicates instead of just reporting them.
Example:
python script.py /path/to/delete/duplicates /path/to/search/originals -w
Dependencies:
- Python 2.7
- os, sys, subprocess, argparse modules
Note:
- This script sets the default encoding to utf-8 to handle non-ASCII file names.
- Ensure you have the necessary permissions to delete files and directories if using the '-w' flag.
Functions:
1. get_all_files(path, exclude_path=None): Recursively retrieves all files from a directory, excluding specified paths.
2. compare_first_n_bytes(file1, file2, n=156): Compares the first 'n' bytes of two files.
3. compare_files(file1, file2): Uses the 'cmp' command to compare two files.
4. is_duplicate(file1, file2): Checks if two files are duplicates based on name, size, and content.
5. delete_empty_dirs(path): Deletes empty directories within a specified path.
6. delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False): Identifies and optionally deletes duplicates.
"""
import argparse
import errno
import os
import subprocess
import sys
def get_all_files(path, exclude_path=None):
    """Recursively collect every file path under *path*.

    Directories whose name starts with '@' (e.g. Synology '@eaDir') or is
    named '#recycle' are pruned and never descended into.  If *exclude_path*
    is given, that directory and its whole subtree are skipped as well.

    :param path: root directory to scan.
    :param exclude_path: optional directory subtree to leave out of the result.
    :return: list of file paths (joined from *path*, so absolute iff *path* is).
    """
    all_files = []
    # Normalize once up front so the per-directory comparison below works even
    # when the caller passes an unnormalized exclude_path (trailing slash,
    # 'dir/./sub', etc.) — this closes the edge case noted in the old TODO.
    if exclude_path:
        exclude_path = os.path.normpath(exclude_path)
    for root, dirs, files in os.walk(path):
        # Prune special directories in-place so os.walk never enters them.
        dirs[:] = [d for d in dirs if not d.startswith('@') and d != '#recycle']
        normalized_root = os.path.normpath(root)
        # Skip the excluded subtree.  Matching on an explicit os.sep boundary
        # prevents '/a/bc' being mistaken for a child of '/a/b'.
        if exclude_path and (normalized_root == exclude_path or normalized_root.startswith(exclude_path + os.sep)):
            continue
        for file in files:
            all_files.append(os.path.join(root, file))
    return all_files
def compare_first_n_bytes(file1, file2, n=156):
    """Return True if the first *n* bytes of *file1* and *file2* are equal.

    Any IOError (missing file, permission problem) is reported to stdout and
    treated as "not equal".
    """
    try:
        with open(file1, 'rb') as left:
            with open(file2, 'rb') as right:
                head_left = left.read(n)
                head_right = right.read(n)
        return head_left == head_right
    except IOError as e:
        print("Error reading files:", e)
        return False
def compare_files(file1, file2):
    """Byte-compare two files with the POSIX ``cmp`` utility.

    Uses ``cmp -s`` (silent mode) so the child process produces no output at
    all.  The previous version redirected stdout/stderr to subprocess.PIPE
    without ever reading the pipes, which can deadlock ``subprocess.call`` if
    the child writes more than the OS pipe buffer holds.

    :return: True only when the files are identical (cmp exit status 0);
             False when they differ or cmp reports trouble (status 1 or 2).
    """
    result = subprocess.call(['cmp', '-s', file1, file2])
    return result == 0  # cmp exits 0 only for identical files
def is_duplicate(file1, file2):
    """Return True if *file1* and *file2* are duplicates.

    Two files count as duplicates when they share the same base name, the
    same size, and identical content.  The checks are ordered cheapest first
    so the expensive content comparison only runs for plausible matches.
    """
    if os.path.basename(file1) != os.path.basename(file2):
        return False
    if os.path.getsize(file1) != os.path.getsize(file2):
        return False
    # Content comparison is the costly step; return its verdict directly
    # instead of the old '== False' anti-idiom.
    return compare_files(file1, file2)
def delete_empty_dirs(path):
    """Remove every empty directory below *path* (but not *path* itself).

    Walks bottom-up (topdown=False) so a directory that becomes empty after
    its children are removed is itself removed later in the same pass.
    """
    for root, dirs, _ in os.walk(path, topdown=False):
        for name in dirs:  # 'name' rather than 'dir' to avoid shadowing the builtin
            dir_path = os.path.join(root, name)
            # EAFP: just attempt the rmdir.  os.rmdir refuses non-empty
            # directories on its own, which also removes the race window the
            # old os.listdir()-then-rmdir check had.
            try:
                os.rmdir(dir_path)
                print("Deleted empty directory:", dir_path)
            except OSError as e:
                # Non-empty is the expected, silent case; anything else
                # (permissions, etc.) is reported like before.
                if e.errno not in (errno.ENOTEMPTY, errno.EEXIST):
                    print("Error deleting directory:", dir_path, "Error:", e)
def delete_duplicates(pathToDeleteDuplicates, pathToSearchOriginals, delete=False):
    """Report (and with delete=True remove) files under *pathToDeleteDuplicates*
    that also exist under *pathToSearchOriginals*, then clean up empty dirs.

    Originals are indexed by (basename, size) so each candidate is only
    content-compared against plausible matches, instead of the previous
    all-pairs O(n*m) scan that ran ``cmp`` checks across every pair.

    :param pathToDeleteDuplicates: directory whose files may be duplicates.
    :param pathToSearchOriginals: directory holding the reference originals.
    :param delete: when True, actually remove each confirmed duplicate.
    """
    files_to_delete = get_all_files(pathToDeleteDuplicates)
    # When the duplicates directory lives inside the originals directory,
    # exclude it from the originals scan so files never match themselves.
    # Match on a real path boundary so '/a/bc' is not treated as inside '/a/b'.
    if (pathToDeleteDuplicates == pathToSearchOriginals
            or pathToDeleteDuplicates.startswith(pathToSearchOriginals + os.sep)):
        exclude = pathToDeleteDuplicates
    else:
        exclude = None
    files_to_search = get_all_files(pathToSearchOriginals, exclude)
    # Index originals by the cheap duplicate criteria (name + size); only
    # same-key files ever reach the expensive content comparison.
    originals_by_key = {}
    for original in files_to_search:
        key = (os.path.basename(original), os.path.getsize(original))
        originals_by_key.setdefault(key, []).append(original)
    for file_to_delete in files_to_delete:
        key = (os.path.basename(file_to_delete), os.path.getsize(file_to_delete))
        for candidate in originals_by_key.get(key, ()):
            if is_duplicate(file_to_delete, candidate):
                print("Duplicate", file_to_delete, 'Original', candidate)
                if delete:
                    try:
                        os.remove(file_to_delete)
                    except OSError as e:
                        print("Error deleting file:", file_to_delete, "Error:", e)
                break  # one confirmed original is enough for this file
    delete_empty_dirs(pathToDeleteDuplicates)
def _main():
    """Command-line entry point: parse arguments, validate paths, run the scan."""
    # Reload sys and set default encoding to utf-8 for handling non-ASCII file names
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # Describe and parse the command line.
    parser = argparse.ArgumentParser(description='Delete duplicate files and remove empty directories.')
    parser.add_argument('pathToDeleteDuplicates', help='Path to the directory containing duplicates to delete.')
    parser.add_argument('pathToSearchOriginals', help='Path to the directory containing original files.')
    parser.add_argument('-w', action='store_true', help='Delete duplicates instead of just reporting them.')
    args = parser.parse_args()

    # Refuse to run unless both arguments are existing directories.
    if not os.path.isdir(args.pathToDeleteDuplicates):
        print("Error: First parameter pathToDeleteDuplicates must be a valid directory:", args.pathToDeleteDuplicates)
        sys.exit(1)
    if not os.path.isdir(args.pathToSearchOriginals):
        print("Error: Second parameter pathToSearchOriginals must be a valid directory:", args.pathToSearchOriginals)
        sys.exit(1)

    print("Going over all files in '{}' and flagging duplicate files that are found in {}".format(args.pathToDeleteDuplicates, args.pathToSearchOriginals))
    if args.w:
        print(" -w flag was specified. Script WILL DELETE duplicate files")

    # Run the duplicate scan with normalized paths and the delete flag.
    delete_duplicates(os.path.normpath(args.pathToDeleteDuplicates), os.path.normpath(args.pathToSearchOriginals), delete=args.w)
    print("All done")


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment