Skip to content

Instantly share code, notes, and snippets.

@iqiancheng
Last active June 4, 2025 09:01
Show Gist options
  • Save iqiancheng/d4bb67a258e566fcee9b26359ee2bfbb to your computer and use it in GitHub Desktop.
Remove duplicate files from a directory tree on macOS
import os
import re
from collections import defaultdict
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by (base name, extension, size).
    - Within a group, files carrying a numeric duplicate tag such as
      "(1)" or "(2)" are deleted, but only when an untagged original
      exists in the same group; originals are always preserved.
    - Files whose names do not match ``base[(n)].ext`` (e.g. no
      extension) are ignored entirely.

    Args:
        root_dir: Directory whose tree is scanned recursively.

    Examples:
        file.txt + file(1).txt -> file(1).txt deleted
        a.py + a(1).png        -> both kept (different extensions)
    """
    # isdir, not exists: a plain-file path would pass exists() and then
    # os.walk() would silently yield nothing.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    # Groups: base name, optional "(n)" duplicate tag, extension
    # (last dot onward).
    pattern = re.compile(r"^(.*?)(\(\d+\))?(\.[^.]+)$")

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:  # broken symlink, permissions, race
                print(f"Skip {full_path}: {e}")
                continue
            match = pattern.match(name)
            if match:
                base, dup_tag, ext = match.groups()
                key = (base.strip(), ext, size)
                file_groups[key].append((full_path, bool(dup_tag)))

    # Deduplication process
    deleted_count = 0
    for files in file_groups.values():
        # Delete tagged copies only when an untagged original survives.
        if any(not is_dup for _, is_dup in files):
            for path, is_dup in files:
                if is_dup:
                    print(f"Deleting duplicate: {path}")
                    try:
                        os.remove(path)
                        deleted_count += 1
                    except OSError as e:
                        print(f"Failed to delete {path}: {e}")

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the user's Downloads folder when run as a script.
if __name__ == "__main__":
    # expanduser() is already a no-op on paths without "~",
    # so no membership check is needed.
    target_path = os.path.expanduser("~/Downloads/")
    dedup_files(target_path)
import os
import re
from collections import defaultdict
# Filename forms that mark a file as a duplicate copy, tried in order.
# Specific macOS/Windows English and Chinese "Copy"/副本/复制 variants come
# first; the generic "(n)" suffixes come last so they only match when no
# named form applies. Each pattern captures the base name and extension.
DUP_PATTERNS = [
    re.compile(r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$'),
    re.compile(r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$'),
]


def parse_filename(name):
    """Split *name* into ``(base, extension, is_duplicate)``.

    The first matching entry in DUP_PATTERNS wins and flags the file as a
    duplicate; otherwise the name is split with os.path.splitext and
    flagged as an original.
    """
    hit = next((m for m in (p.match(name) for p in DUP_PATTERNS) if m), None)
    if hit is not None:
        return hit.group('base').strip(), hit.group('ext'), True
    stem, suffix = os.path.splitext(name)
    return stem.strip(), suffix, False
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by (base name, extension, size) as reported by
      parse_filename(), which recognizes macOS and Windows
      (English/Chinese) duplicate-naming patterns.
    - Within a group, files flagged as duplicates are deleted only when
      at least one untagged original is present; originals are preserved.

    Args:
        root_dir: Directory whose tree is scanned recursively.

    Examples:
        a.py + a(1).png                       -> both kept (different extensions)
        document.pdf + "Copy of document.pdf" -> "Copy of document.pdf" deleted
        image.jpg + "image - 副本.jpg"         -> "image - 副本.jpg" deleted
    """
    # isdir, not exists: a plain-file path would pass exists() and then
    # os.walk() would silently yield nothing.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:  # broken symlink, permissions, race
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        # Delete flagged copies only when an untagged original survives.
        if any(not is_dup for _, is_dup in files):
            for path, is_dup in files:
                if is_dup:
                    print(f"Deleting duplicate: {path}")
                    try:
                        os.remove(path)
                        deleted_count += 1
                    except OSError as e:
                        print(f"Failed to delete {path}: {e}")

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the user's Downloads folder when run as a script.
if __name__ == "__main__":
    # expanduser() is already a no-op on paths without "~",
    # so no membership check is needed.
    target_path = os.path.expanduser("~/Downloads/")
    dedup_files(target_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment