Skip to content

Instantly share code, notes, and snippets.

@iqiancheng
Last active June 4, 2025 09:01
Show Gist options
  • Save iqiancheng/d4bb67a258e566fcee9b26359ee2bfbb to your computer and use it in GitHub Desktop.
Remove duplicate files from a directory tree on macOS
import os
import re
from collections import defaultdict
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by (base name, extension, size).
    - Within a group, files carrying a numeric duplicate tag such as
      "(1)" or "(2)" are deleted, but only when an untagged original
      exists in the same group; originals are always preserved.
    - Files whose names do not match ``base[(n)].ext`` (e.g. no
      extension) are ignored entirely.

    Args:
        root_dir: Directory whose tree is scanned recursively.

    Examples:
        file.txt + file(1).txt -> file(1).txt deleted
        a.py + a(1).png        -> both kept (different extensions)
    """
    # isdir, not exists: a plain-file path would pass exists() and then
    # os.walk() would silently yield nothing.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    # Groups: base name, optional "(n)" duplicate tag, extension
    # (last dot onward).
    pattern = re.compile(r"^(.*?)(\(\d+\))?(\.[^.]+)$")

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:  # broken symlink, permissions, race
                print(f"Skip {full_path}: {e}")
                continue
            match = pattern.match(name)
            if match:
                base, dup_tag, ext = match.groups()
                key = (base.strip(), ext, size)
                file_groups[key].append((full_path, bool(dup_tag)))

    # Deduplication process
    deleted_count = 0
    for files in file_groups.values():
        # Delete tagged copies only when an untagged original survives.
        if any(not is_dup for _, is_dup in files):
            for path, is_dup in files:
                if is_dup:
                    print(f"Deleting duplicate: {path}")
                    try:
                        os.remove(path)
                        deleted_count += 1
                    except OSError as e:
                        print(f"Failed to delete {path}: {e}")

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the user's Downloads folder when run as a script.
if __name__ == "__main__":
    # expanduser() is already a no-op on paths without "~",
    # so no membership check is needed.
    target_path = os.path.expanduser("~/Downloads/")
    dedup_files(target_path)
import os
import re
from collections import defaultdict
# Filename forms that mark a file as a duplicate copy, tried in order.
# Specific macOS/Windows English and Chinese "Copy"/副本/复制 variants come
# first; the generic "(n)" suffixes come last so they only match when no
# named form applies. Each pattern captures the base name and extension.
DUP_PATTERNS = [
    re.compile(r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$'),
    re.compile(r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$'),
]


def parse_filename(name):
    """Split *name* into ``(base, extension, is_duplicate)``.

    The first matching entry in DUP_PATTERNS wins and flags the file as a
    duplicate; otherwise the name is split with os.path.splitext and
    flagged as an original.
    """
    hit = next((m for m in (p.match(name) for p in DUP_PATTERNS) if m), None)
    if hit is not None:
        return hit.group('base').strip(), hit.group('ext'), True
    stem, suffix = os.path.splitext(name)
    return stem.strip(), suffix, False
def dedup_files(root_dir):
    """
    Remove duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by (base name, extension, size) as reported by
      parse_filename(), which recognizes macOS and Windows
      (English/Chinese) duplicate-naming patterns.
    - Within a group, files flagged as duplicates are deleted only when
      at least one untagged original is present; originals are preserved.

    Args:
        root_dir: Directory whose tree is scanned recursively.

    Examples:
        a.py + a(1).png                       -> both kept (different extensions)
        document.pdf + "Copy of document.pdf" -> "Copy of document.pdf" deleted
        image.jpg + "image - 副本.jpg"         -> "image - 副本.jpg" deleted
    """
    # isdir, not exists: a plain-file path would pass exists() and then
    # os.walk() would silently yield nothing.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:  # broken symlink, permissions, race
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        # Delete flagged copies only when an untagged original survives.
        if any(not is_dup for _, is_dup in files):
            for path, is_dup in files:
                if is_dup:
                    print(f"Deleting duplicate: {path}")
                    try:
                        os.remove(path)
                        deleted_count += 1
                    except OSError as e:
                        print(f"Failed to delete {path}: {e}")

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: deduplicate the user's Downloads folder when run as a script.
if __name__ == "__main__":
    # expanduser() is already a no-op on paths without "~",
    # so no membership check is needed.
    target_path = os.path.expanduser("~/Downloads/")
    dedup_files(target_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment