Created
July 24, 2025 14:54
-
-
Save kenilt/972a4207fc8a3b3ac392c2c8070f4f21 to your computer and use it in GitHub Desktop.
Detects and moves duplicate files to a separate `Duplicated` folder. It chooses the best copy to keep based on filename and folder structure.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Move same-named duplicate files into a separate ``Duplicated`` folder.

Files under ``scan_folder`` are grouped purely by file name (file contents
are NOT compared).  For every name that occurs more than once, the copy whose
parent folder best matches the date embedded in the file name is kept in
place; every other copy is moved under ``duplicate_target``, preserving its
path relative to ``scan_folder``.
"""
import os
import shutil
import re
from collections import defaultdict
from datetime import datetime

# === CONFIG ===
scan_folder = "path/to/target/folder"  # ← change this to your target folder
duplicate_target = os.path.join(os.getcwd(), "Duplicated")

# Regex to extract date from filename (first 8 consecutive digits, read as YYYYMMDD)
filename_date_pattern = re.compile(r"(\d{4})(\d{2})(\d{2})")

# Regex to extract folder date (YYYY_MM_DD or YYYY_MM)
folder_date_pattern = re.compile(r"(\d{4})_(\d{2})(?:_(\d{2}))?")

# Kept at module level for backward compatibility; filled in by main().
file_map = defaultdict(list)


def group_files_by_name(folder, exclude=None):
    """Walk *folder* and group every file path by its bare file name.

    Args:
        folder: Directory to scan recursively.
        exclude: Optional directory that must not be descended into
            (used to keep the quarantine folder out of the scan when it
            lives inside *folder*).

    Returns:
        A ``defaultdict(list)`` mapping file name -> list of full paths.
        macOS ``._*`` metadata files are skipped.
    """
    groups = defaultdict(list)
    excluded = os.path.abspath(exclude) if exclude else None
    for root, dirs, files in os.walk(folder):
        if excluded is not None:
            # Pruning ``dirs`` in place stops os.walk from descending into it.
            dirs[:] = [
                d for d in dirs
                if os.path.abspath(os.path.join(root, d)) != excluded
            ]
        for file in files:
            if file.startswith("._"):
                continue  # Skip macOS metadata
            groups[file].append(os.path.join(root, file))
    return groups


def _score_path(file_ymd, path):
    """Score how well the parent folder of *path* matches *file_ymd*.

    Returns 3 for an exact YYYY_MM_DD match, 2 for a YYYY_MM match, and
    otherwise ``1 + len(folder) / 1000`` so that, among non-matching
    folders, the longer folder name wins.
    """
    folder = os.path.basename(os.path.dirname(path))
    folder_match = folder_date_pattern.search(folder)
    if folder_match:
        folder_ymd = folder_match.groups()
        if file_ymd == folder_ymd:
            return 3  # Exact YYYY_MM_DD match
        if file_ymd[:2] == folder_ymd[:2]:
            return 2  # Partial YYYY_MM match
    # Fallback: longer folder name
    return 1 + len(folder) / 1000.0


def rank_paths(file_name, paths):
    """Return *paths* ordered best-copy-first for the given *file_name*.

    Ties on score are broken by path, descending, because the
    ``(score, path)`` tuples are sorted in reverse order.
    """
    file_date_match = filename_date_pattern.search(file_name)
    file_ymd = file_date_match.groups() if file_date_match else (None, None, None)
    scored_paths = [(_score_path(file_ymd, path), path) for path in paths]
    # Sort by score descending (higher = better match)
    scored_paths.sort(reverse=True)
    return [path for _, path in scored_paths]


def _unique_destination(dest_path):
    """Return *dest_path*, or a ``_N``-suffixed variant if it already exists.

    Prevents a later move from silently overwriting a file that an earlier
    move (or an earlier run) already placed at the same destination.
    """
    if not os.path.lexists(dest_path):
        return dest_path
    stem, ext = os.path.splitext(dest_path)
    counter = 1
    while os.path.lexists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


def move_duplicates(groups, source_root, target_root):
    """Keep the best copy of each duplicated name and move the rest.

    Args:
        groups: Mapping of file name -> list of paths (see
            ``group_files_by_name``).
        source_root: Folder the paths were collected from; used to compute
            each moved file's relative location.
        target_root: Folder that receives the moved duplicates.

    Returns:
        A list of ``(source_path, destination_path)`` pairs, one per move.
    """
    moved = []
    for file_name, paths in groups.items():
        if len(paths) < 2:
            continue
        keep_path, *to_move = rank_paths(file_name, paths)
        print(f"[KEEP] {keep_path}")
        for src_path in to_move:
            rel_path = os.path.relpath(src_path, source_root)
            dest_path = _unique_destination(os.path.join(target_root, rel_path))
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.move(src_path, dest_path)
            print(f"Moved duplicate: {src_path} → {dest_path}")
            moved.append((src_path, dest_path))
    return moved


def main():
    """Scan ``scan_folder`` and move duplicates into ``duplicate_target``."""
    # === Step 1: Group files by name ===
    file_map.update(group_files_by_name(scan_folder, exclude=duplicate_target))
    # === Step 2: Apply matching logic ===
    move_duplicates(file_map, scan_folder, duplicate_target)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment