Last active
June 4, 2025 09:01
-
-
Save iqiancheng/d4bb67a258e566fcee9b26359ee2bfbb to your computer and use it in GitHub Desktop.
Remove duplicate files from a directory tree on macOS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from collections import defaultdict | |
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by: basename + extension + size
    - Files with an (n) suffix before the extension are duplicate candidates
    - A candidate is deleted only if the group contains an untagged
      original AND the candidate is byte-for-byte identical to one of
      those originals
    - Original files (without tags) are always preserved
    - Files without an extension are ignored

    Examples:
    - file.txt (original) + file(1).txt (identical copy) -> file(1).txt deleted
    - file.txt + file(1).txt with same size but different bytes -> both kept
    - a.py + a(1).png -> both kept (different extensions)

    Args:
        root_dir: Path of the directory to scan recursively.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # Local import so this function does not depend on the file's import block.
    import filecmp

    # os.walk() silently yields nothing for a non-directory, so reject it here.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    # Groups: 1 = base name, 2 = optional "(n)" duplicate tag, 3 = extension.
    pattern = re.compile(r"^(.*?)(\(\d+\))?(\.[^.]+)$")

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:
                # E.g. broken symlink or permission problem: leave the file alone.
                print(f"Skip {full_path}: {e}")
                continue
            match = pattern.match(name)
            if not match:
                continue
            base, dup_tag, ext = match.groups()
            key = (base.strip(), ext, size)
            file_groups[key].append((full_path, bool(dup_tag)))

    # Deduplication process
    deleted_count = 0
    for files in file_groups.values():
        originals = [path for path, is_dup in files if not is_dup]
        if not originals:
            # Only tagged copies exist; keep them all rather than guess.
            continue
        for path, is_dup in files:
            if not is_dup:
                continue
            try:
                # Equal name and size do not prove equal content (the group
                # may span several directories), so compare the actual bytes.
                identical = any(
                    filecmp.cmp(path, original, shallow=False)
                    for original in originals
                )
            except OSError as e:
                print(f"Skip {path}: {e}")
                continue
            if not identical:
                continue
            print(f"Deleting duplicate: {path}")
            try:
                os.remove(path)
            except OSError as e:
                print(f"Failed to delete {path}: {e}")
            else:
                deleted_count += 1

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the current user's Downloads folder.
# NOTE: this call sits at module level, so it runs whenever the file is
# executed or imported.
# os.path.expanduser() returns paths that do not start with "~" unchanged,
# so it can be applied unconditionally.
target_path = os.path.expanduser("~/Downloads/")
dedup_files(target_path)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from collections import defaultdict | |
# Filename shapes that mark a file as a copy, as (regex, flags) pairs.
# Every regex captures the original stem as ``base`` and the final extension
# as ``ext``. The list is scanned in order and the first match wins, so the
# specific "copy" wordings come before the generic "(n)" suffixes.
_DUP_PATTERN_SPECS = (
    # "Copy of name.ext"
    (r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # "name - Copy (n).ext"
    (r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # "name - Copy.ext"
    (r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # Chinese "copy" wording: "name - 副本 (n).ext"
    (r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name - 副本.ext"
    (r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$', 0),
    # "副本 name.ext"
    (r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$', 0),
    # Alternative Chinese wording: "name - 复制 (n).ext"
    (r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name - 复制.ext"
    (r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$', 0),
    # "name (n).ext" (space before the counter)
    (r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name(n).ext" (no space before the counter)
    (r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$', 0),
)
DUP_PATTERNS = [re.compile(regex, flags) for regex, flags in _DUP_PATTERN_SPECS]
def parse_filename(name):
    """
    Split a filename into (base, extension, is_duplicate).

    The name is tried against DUP_PATTERNS in order. On the first match the
    pattern's ``base`` group (whitespace-stripped) and ``ext`` group are
    returned with ``True``. If nothing matches, the name is split with
    os.path.splitext() and returned with ``False``.
    """
    attempts = (pat.match(name) for pat in DUP_PATTERNS)
    # The generator is lazy, so scanning stops at the first pattern that matches.
    hit = next((m for m in attempts if m), None)
    if hit is None:
        stem, suffix = os.path.splitext(name)
        return stem.strip(), suffix, False
    return hit.group('base').strip(), hit.group('ext'), True
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by: basename + extension + size
    - A file is a duplicate candidate when parse_filename() recognises one
      of the duplicate naming patterns in its name
    - A candidate is deleted only if the group contains an untagged
      original AND the candidate is byte-for-byte identical to one of
      those originals
    - Original files (without tags) are always preserved

    Examples:
    - a.py + a(1).png -> both kept (different extensions)
    - document.pdf + "Copy of document.pdf" (identical) -> "Copy of document.pdf" deleted
    - image.jpg + "image - 副本.jpg" (identical) -> "image - 副本.jpg" deleted
    - report.txt + "report (1).txt" with same size but different bytes -> both kept

    Args:
        root_dir: Path of the directory to scan recursively.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # Local import so this function does not depend on the file's import block.
    import filecmp

    # os.walk() silently yields nothing for a non-directory, so reject it here.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:
                # E.g. broken symlink or permission problem: leave the file alone.
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        originals = [path for path, is_dup in files if not is_dup]
        if not originals:
            # Only tagged copies exist; keep them all rather than guess.
            continue
        for path, is_dup in files:
            if not is_dup:
                continue
            try:
                # Equal name and size do not prove equal content (the group
                # may span several directories), so compare the actual bytes.
                identical = any(
                    filecmp.cmp(path, original, shallow=False)
                    for original in originals
                )
            except OSError as e:
                print(f"Skip {path}: {e}")
                continue
            if not identical:
                continue
            print(f"Deleting duplicate: {path}")
            try:
                os.remove(path)
            except OSError as e:
                print(f"Failed to delete {path}: {e}")
            else:
                deleted_count += 1

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the current user's Downloads folder.
# NOTE: this call sits at module level, so it runs whenever the file is
# executed or imported.
# os.path.expanduser() returns paths that do not start with "~" unchanged,
# so it can be applied unconditionally.
target_path = os.path.expanduser("~/Downloads/")
dedup_files(target_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment