Created
January 5, 2025 09:40
-
-
Save SylvanG/6ddc6bc7456014d776af57d2112f1ab2 to your computer and use it in GitHub Desktop.
A Python script for identifying and managing duplicate files in a directory, specifically targeting files with patterns like filename(1).ext. The script safely handles duplicates by comparing file sizes, respecting macOS tags, and moving files to a trash directory instead of permanently deleting them. It includes error handling and logging to ensure safe operation.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
from collections import defaultdict | |
from pathlib import Path | |
import shutil | |
from typing import Generator | |
import logging | |
import xattr | |
logging.basicConfig(level=logging.INFO) | |
def find_duplicate_files(directory: Path) -> dict[Path, list[Path]]:
    """Group copy-suffixed files (e.g. "name(1).ext") under their original path.

    Scans *directory* non-recursively for regular files whose name carries a
    numeric copy suffix and maps the presumed original file's path to the
    list of duplicate candidates found for it. The original itself is not
    required to exist — callers must check.
    """
    # Captures: base name, copy number, optional final extension ("report(2).pdf").
    copy_suffix_re = re.compile(r"(.*)\((\d+)\)(\.[^.]*)?$")
    groups: dict[Path, list[Path]] = defaultdict(list)
    for entry in directory.iterdir():
        if not entry.is_file():
            continue
        m = copy_suffix_re.match(entry.name)
        if m is None:
            continue
        stem, _copy_no, ext = m.groups()
        # "report (1).pdf" -> "report.pdf": whitespace around the stem is dropped.
        groups[directory / f"{stem.strip()}{ext or ''}"].append(entry)
    return groups
def check_duplicate_files(directory: Path) -> Generator[tuple[Path, list[Path]], None, None]:
    """Yield (original, duplicates) pairs that look safe to clean up.

    A group is yielded only when the original file exists, every duplicate
    has exactly the original's byte size, and none of the duplicates carry
    macOS Finder tags; any other group is logged and skipped.
    """
    for original, dupes in find_duplicate_files(directory).items():
        # Guard: a duplicate without a surviving original must not be touched.
        if not original.exists():
            logging.warning(f"Original file does not exist: {original}")
            continue
        if not dupes:
            continue
        try:
            expected_size = original.stat().st_size
            # Same size is the (cheap) proxy for "same content" used here.
            if any(d.stat().st_size != expected_size for d in dupes):
                logging.warning(f"File sizes don't match: {original} vs {dupes}")
                continue
            # Finder tags suggest the user cares about that copy — leave it alone.
            if any(has_macos_tags(d) for d in dupes):
                logging.warning(f"File has macOS tags: {original}")
                continue
            yield original, dupes
        except OSError as e:
            logging.error(f"Error accessing files: {e}")
            continue
def has_macos_tags(file_path: Path) -> bool:
    """Return True if *file_path* carries macOS Finder tags.

    Inspects the file's extended attributes for the
    ``com.apple.metadata:_kMDItemUserTags`` key. Any failure (missing
    file, unsupported filesystem, permission error, ...) is logged and
    treated as "no tags" so the caller can keep processing.
    """
    try:
        attributes = xattr.listxattr(file_path)
    except Exception as e:
        # Route errors through logging like the rest of the module (was print).
        logging.error(f"Error checking tags for {file_path}: {e}")
        return False
    return any("com.apple.metadata:_kMDItemUserTags" in attr for attr in attributes)
def delete_duplicate_files(original: Path, files: list[Path], trash_dir: Path) -> None:
    """Move the duplicate *files* into *trash_dir*, plus a backup copy of *original*.

    Uses ``shutil.move`` instead of ``Path.rename`` so *trash_dir* may live
    on a different filesystem (``rename`` raises ``OSError`` on cross-device
    moves), and uniquifies colliding names so nothing already sitting in the
    trash is silently overwritten.

    Raises:
        OSError: if a move or copy fails (propagated to the caller).
    """
    for file in files:
        shutil.move(file, _unique_path(trash_dir / file.name))
    # Keep a safety copy of the surviving original alongside its duplicates.
    shutil.copy2(original, _unique_path(trash_dir / original.name))


def _unique_path(candidate: Path) -> Path:
    """Return *candidate*, or a "stem_1.ext"-style variant that does not exist yet."""
    if not candidate.exists():
        return candidate
    n = 1
    while True:
        alt = candidate.with_name(f"{candidate.stem}_{n}{candidate.suffix}")
        if not alt.exists():
            return alt
        n += 1
def find_and_delete_duplicates(directory: Path, trash_dir: Path | None = None) -> None:
    """Clean up copy-suffixed duplicates found in *directory*.

    Each safe duplicate group (see ``check_duplicate_files``) is moved into
    *trash_dir* — defaulting to a "trash" subdirectory of *directory*,
    created on demand — rather than being deleted outright.
    """
    target = directory / "trash" if trash_dir is None else trash_dir
    target.mkdir(parents=True, exist_ok=True)
    for original, dupes in check_duplicate_files(directory):
        delete_duplicate_files(original, dupes, target)
        logging.info(f"Deleted {len(dupes)} duplicates of {original}")
if __name__ == "__main__":
    # Interactive entry point: prompt for a directory and clean it up.
    raw = input("Enter the directory path to scan for duplicates: ").strip()
    # expanduser() additionally accepts "~/Downloads"-style paths.
    directory_path = Path(raw).expanduser()
    if directory_path.is_dir():
        find_and_delete_duplicates(directory_path)
    else:
        print("Invalid directory path!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment