
@SylvanG
Created January 5, 2025 09:40
A Python script for identifying and managing duplicate files in a directory, specifically targeting files with patterns like filename(1).ext. The script safely handles duplicates by comparing file sizes, respecting macOS tags, and moving files to a trash directory instead of permanently deleting them. It includes error handling and logging to ensure safe operation.

#!/usr/bin/env python3
import logging
import re
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Generator

import xattr

logging.basicConfig(level=logging.INFO)
def find_duplicate_files(directory: Path) -> dict[Path, list[Path]]:
    # Regex to match files with "(1)", "(2)", etc. in the name
    duplicate_pattern = re.compile(r"(.*)\((\d+)\)(\.[^.]*)?$")
    # Dictionary to group duplicate files under the original they point back to
    file_groups = defaultdict(list)
    # Scan the directory (top level only; subdirectories are not visited)
    for file_path in directory.iterdir():
        if not file_path.is_file():
            continue
        filename = file_path.name
        match = duplicate_pattern.match(filename)
        if match:
            base_name, _, extension = match.groups()
            original_name = Path(f"{base_name.strip()}{extension or ''}")
            original_file = directory / original_name
            file_groups[original_file].append(file_path)
    return file_groups
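
# For illustration (hypothetical names): a directory containing "report.pdf",
# "report(1).pdf", and "report(2).pdf" would make find_duplicate_files return
#   {directory / "report.pdf": [directory / "report(1).pdf",
#                               directory / "report(2).pdf"]}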
def check_duplicate_files(
    directory: Path,
) -> Generator[tuple[Path, list[Path]], None, None]:
    duplicates = find_duplicate_files(directory)
    for original, files in duplicates.items():
        # Skip the group if the original file doesn't exist
        if not original.exists():
            logging.warning(f"Original file does not exist: {original}")
            continue
        # Skip if no duplicate files were found
        if not files:
            continue
        try:
            original_size = original.stat().st_size
            # Only treat files as duplicates if they match the original's size
            if not all(file.stat().st_size == original_size for file in files):
                logging.warning(f"File sizes don't match: {original} vs {files}")
                continue
            # Leave the group alone if any duplicate carries macOS tags
            if any(has_macos_tags(file) for file in files):
                logging.warning(f"A duplicate of {original} has macOS tags; skipping")
                continue
            yield original, files
        except OSError as e:
            logging.error(f"Error accessing files: {e}")
            continue
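
# A minimal sketch of consuming the generator on its own (the directory is
# hypothetical); every yielded pair has already passed the size and tag checks:
#   for original, dupes in check_duplicate_files(Path.home() / "Downloads"):
#       print(original.name, "->", [d.name for d in dupes])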
def has_macos_tags(file_path: Path) -> bool:
    try:
        # Get the file's extended attributes
        attributes = xattr.listxattr(file_path)
        # Check whether any attribute relates to macOS Finder tags
        for attr in attributes:
            if "com.apple.metadata:_kMDItemUserTags" in attr:
                return True
        return False
    except Exception as e:
        logging.error(f"Error checking tags for {file_path}: {e}")
        return False
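
# Finder stores tags in the "com.apple.metadata:_kMDItemUserTags" extended
# attribute, so a tagged file can also be spotted from a macOS shell
# (path hypothetical):
#   $ xattr /path/to/tagged-file.txt
#   com.apple.metadata:_kMDItemUserTags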
def delete_duplicate_files(original: Path, files: list[Path], trash_dir: Path) -> None:
    # Move each duplicate into the trash directory instead of deleting it
    for file in files:
        file.rename(trash_dir / file.name)
    # Also stash a copy of the original in the trash as a safety backup;
    # the original itself stays in place
    target_path = trash_dir / original.name
    shutil.copy2(original, target_path)
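
# Note: Path.rename only works within a single filesystem. If the trash
# directory could live on another volume (an assumption about deployment, not
# something this script requires), shutil.move(file, trash_dir / file.name)
# is a drop-in alternative.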
def find_and_delete_duplicates(directory: Path, trash_dir: Path | None = None) -> None:
    if trash_dir is None:
        trash_dir = directory / "trash"
    trash_dir.mkdir(parents=True, exist_ok=True)
    for original, files in check_duplicate_files(directory):
        delete_duplicate_files(original, files, trash_dir)
        logging.info(f"Moved {len(files)} duplicates of {original} to {trash_dir}")
if __name__ == "__main__":
    directory_path = input("Enter the directory path to scan for duplicates: ").strip()
    directory_path = Path(directory_path)
    if directory_path.is_dir():
        find_and_delete_duplicates(directory_path)
    else:
        print("Invalid directory path!")