Skip to content

Instantly share code, notes, and snippets.

@kenilt
Created July 24, 2025 14:54
Show Gist options
  • Save kenilt/972a4207fc8a3b3ac392c2c8070f4f21 to your computer and use it in GitHub Desktop.
Save kenilt/972a4207fc8a3b3ac392c2c8070f4f21 to your computer and use it in GitHub Desktop.
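Detects duplicate files (files that share the same filename anywhere under the scanned folder) and moves the extra copies to a separate `Duplicated` folder. It chooses the best copy to keep by comparing the date in the filename (YYYYMMDD) with the date in the parent folder's name (YYYY_MM_DD or YYYY_MM).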
import os
import shutil
import re
from collections import defaultdict
from datetime import datetime
# === CONFIG ===
scan_folder = "path/to/target/folder"  # ← change this to your target folder
duplicate_target = os.path.join(os.getcwd(), "Duplicated")

# Regex to extract date from filename
filename_date_pattern = re.compile(r"(\d{4})(\d{2})(\d{2})")

# Regex to extract folder date (YYYY_MM_DD or YYYY_MM)
folder_date_pattern = re.compile(r"(\d{4})_(\d{2})(?:_(\d{2}))?")

file_map = defaultdict(list)


def group_files_by_name(scan_root, exclude_dir=None):
    """Group every file under *scan_root* by its bare filename.

    Args:
        scan_root: Directory to walk recursively.
        exclude_dir: Optional directory that is pruned from the walk. Pass
            the duplicates folder here so that files moved by an earlier run
            are not picked up again when that folder lives inside
            *scan_root*.

    Returns:
        A ``defaultdict(list)`` mapping filename -> list of full paths.
        Files whose name starts with ``._`` (macOS metadata) are skipped.
    """
    groups = defaultdict(list)
    excluded = None
    if exclude_dir is not None:
        excluded = os.path.normcase(os.path.abspath(exclude_dir))

    for root, dirs, files in os.walk(scan_root):
        if excluded is not None:
            # Prune in place so os.walk never descends into the excluded dir.
            dirs[:] = [
                d for d in dirs
                if os.path.normcase(os.path.abspath(os.path.join(root, d)))
                != excluded
            ]
        for file in files:
            if file.startswith("._"):
                continue  # Skip macOS metadata
            groups[file].append(os.path.join(root, file))
    return groups


def score_path(file_name, path):
    """Score how well the parent folder of *path* matches the date in *file_name*.

    Returns:
        3 when the folder date (YYYY_MM_DD) equals the filename date,
        2 when only year and month match, otherwise
        ``1 + len(folder_name) / 1000.0`` so that, among non-matching
        folders, the longer folder name wins.
    """
    file_date_match = filename_date_pattern.search(file_name)
    file_date = file_date_match.groups() if file_date_match else (None, None, None)

    folder = os.path.basename(os.path.dirname(path))
    folder_match = folder_date_pattern.search(folder)
    if folder_match:
        folder_date = folder_match.groups()
        if file_date == folder_date:
            return 3  # Exact YYYY_MM_DD match
        if file_date[:2] == folder_date[:2]:
            return 2  # Partial YYYY_MM match
    # Fallback: longer folder name
    return 1 + len(folder) / 1000.0


def rank_paths(file_name, paths):
    """Split *paths* into the copy to keep and the copies to move.

    Paths are ordered by ``(score, path)`` descending, so ties on score are
    broken deterministically by the path string.

    Returns:
        A ``(keep_path, to_move)`` tuple; ``to_move`` is a list.
    """
    scored_paths = sorted(
        ((score_path(file_name, path), path) for path in paths),
        reverse=True,
    )
    keep_path = scored_paths[0][1]
    to_move = [p for _, p in scored_paths[1:]]
    return keep_path, to_move


def unique_destination(dest_path):
    """Return *dest_path*, or a ``name (N).ext`` variant if it already exists.

    shutil.move would otherwise overwrite an existing file (POSIX) or raise
    (Windows) when a previous run already placed a file at the destination.
    """
    if not os.path.lexists(dest_path):
        return dest_path
    stem, ext = os.path.splitext(dest_path)
    counter = 1
    while os.path.lexists(f"{stem} ({counter}){ext}"):
        counter += 1
    return f"{stem} ({counter}){ext}"


def move_duplicates(groups, scan_root, target_root):
    """Keep the best-scored copy of each filename and move the rest.

    Each moved file keeps its path relative to *scan_root* underneath
    *target_root*.

    NOTE(review): files are treated as duplicates purely because they share
    a filename; their contents are never compared. Confirm this is intended
    before running on folders where unrelated files can share a name.

    Args:
        groups: Mapping of filename -> list of full paths.
        scan_root: Root the paths were collected from.
        target_root: Folder that receives the duplicates.

    Returns:
        A list of ``(src_path, dest_path)`` tuples for every file moved.
    """
    moved = []
    for file_name, paths in groups.items():
        if len(paths) < 2:
            continue

        keep_path, to_move = rank_paths(file_name, paths)
        print(f"[KEEP] {keep_path}")

        for src_path in to_move:
            rel_path = os.path.relpath(src_path, scan_root)
            dest_path = unique_destination(os.path.join(target_root, rel_path))
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.move(src_path, dest_path)
            print(f"Moved duplicate: {src_path} → {dest_path}")
            moved.append((src_path, dest_path))
    return moved


if not os.path.isdir(scan_folder):
    # os.walk silently yields nothing for a missing folder; say so instead.
    print(f"[WARN] Scan folder does not exist: {scan_folder}")

# === Step 1: Group files by name ===
file_map = group_files_by_name(scan_folder, exclude_dir=duplicate_target)

# === Step 2: Apply matching logic ===
move_duplicates(file_map, scan_folder, duplicate_target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment