Skip to content

Instantly share code, notes, and snippets.

@AlphaSheep
Created April 18, 2025 13:37
Show Gist options
  • Save AlphaSheep/008b4378b8038e22f5a0da8bd0ab1bc2 to your computer and use it in GitHub Desktop.
Save AlphaSheep/008b4378b8038e22f5a0da8bd0ab1bc2 to your computer and use it in GitHub Desktop.
Deduplicate MP3 files
#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "mutagen",
# ]
# ///
"""
Deduplicates music, assuming it has been tagged with MusicBrainz Picard.
For each track ID, keeps the copy with the largest file size and deletes the others.
Place this script in the folder containing the music and run it.
If you use uv, run it with:
    uv run music_dedup.py
"""
import logging
from pathlib import Path
from typing import Final
from mutagen.id3 import ID3, ID3NoHeaderError
# Configure the root logger so that messages at DEBUG level and above are emitted.
logging.basicConfig(level=logging.DEBUG)
# Directory that contains this script; main() scans it for MP3 files.
CURRENT_DIR: Final[Path] = Path(__file__).resolve().parent
# Upper bound on the number of tagged files collected by get_track_file_mapping().
FILE_COUNT_LIMIT: Final[int] = 10_000_000
# Description of the ID3 "TXXX" frame whose text holds the track ID;
# extract_trackid() compares it case-insensitively.
TAG_NAME: Final[str] = "MusicBrainz Release Track Id"
def get_folders(root: Path) -> list[Path]:
    """Return the non-hidden directories located directly inside ``root``.

    Regular files are ignored, as is any entry whose name starts with a dot.
    The search is not recursive.
    """
    visible_dirs: list[Path] = []
    for entry in root.glob("*"):
        # Skip dot-prefixed entries and anything that is not a directory.
        if entry.name.startswith(".") or not entry.is_dir():
            continue
        visible_dirs.append(entry)
    return visible_dirs
def get_mp3_files(root: Path) -> list[Path]:
    """Return every ``*.mp3`` file below ``root`` (recursively), sorted by path.

    Only regular files (or symlinks resolving to one) are returned. A
    directory whose name happens to end in ``.mp3`` also matches the glob
    pattern, but it is not an audio file, so it is filtered out here rather
    than being handed to the tag reader.
    """
    return sorted(p for p in root.rglob("*.mp3") if p.is_file())
def extract_trackid(path: Path) -> str | None:
    """Read the track ID stored in the ID3 tags of ``path``.

    The ID is taken from the first ``TXXX`` frame whose description equals
    ``TAG_NAME`` (compared case-insensitively) and whose text is non-empty.

    Returns:
        The first text value of that frame, or ``None`` (after logging a
        warning) when the file has no ID3 header or no matching frame.
    """
    try:
        id3_tags = ID3(path)
    except ID3NoHeaderError:
        logging.warning(f"No ID3 header found in {path}")
        return None

    wanted_desc = TAG_NAME.lower()
    matching_frame = next(
        (
            txxx
            for txxx in id3_tags.getall("TXXX")
            if txxx.desc.lower() == wanted_desc and txxx.text
        ),
        None,
    )
    if matching_frame is None:
        logging.warning(f"No tag '{TAG_NAME}' found in {path}")
        return None
    return matching_frame.text[0]
def get_track_file_mapping(folder: Path) -> dict[str, list[Path]]:
    """Group the MP3 files under ``folder`` by their track ID.

    Files for which ``extract_trackid`` returns ``None`` are left out.
    Scanning stops once ``FILE_COUNT_LIMIT`` tagged files have been recorded.

    Returns:
        A dict mapping each track ID to the list of files carrying it, in
        the order produced by ``get_mp3_files``.
    """
    files_by_track: dict[str, list[Path]] = {}
    tagged_total = 0
    for mp3_path in get_mp3_files(folder):
        logging.info(f"Processing file {mp3_path}")
        if (found_id := extract_trackid(mp3_path)) is not None:
            files_by_track.setdefault(found_id, []).append(mp3_path)
            tagged_total += 1
            # Only files that actually have a track ID count toward the limit.
            if tagged_total >= FILE_COUNT_LIMIT:
                break
    return files_by_track
def dedup(track_file_map: dict[str, list[Path]]) -> None:
    """Delete duplicate files, keeping the largest copy of each track.

    For every track ID that maps to more than one file, the file with the
    greatest size is kept (the first one wins on a tie) and the others are
    unlinked from disk.

    A path that refers to the same underlying file as the kept copy (a
    symlink or hard link to it) is never deleted: removing it could destroy
    the only real copy and leave a dangling link behind.

    Args:
        track_file_map: Mapping of track ID to the files carrying that ID.

    Raises:
        OSError: If a listed file cannot be stat-ed or removed (for example
            ``FileNotFoundError`` when it vanished before being examined).
    """
    for track_id, files in track_file_map.items():
        if len(files) <= 1:
            continue
        logging.info(f"Checking track {track_id}: {files[0]}")

        # Stat each file once; the result supplies both size and identity.
        stats = {f: f.stat() for f in files}
        largest = max(files, key=lambda f: stats[f].st_size)
        kept_stat = stats[largest]
        kept_identity = (kept_stat.st_dev, kept_stat.st_ino)
        logging.info(f"--- KEEP {largest} ({kept_stat.st_size} bytes)")

        for f in files:
            if f == largest:
                continue
            f_stat = stats[f]
            if (f_stat.st_dev, f_stat.st_ino) == kept_identity:
                # Same device and inode: this path is an alias of the kept file.
                logging.warning(
                    f"--- SKIP {f} (same underlying file as {largest})"
                )
                continue
            logging.info(f"!!! DELETE {f} ({f_stat.st_size} bytes)")
            f.unlink(missing_ok=True)
def main() -> None:
    """Scan the script's directory for tagged MP3 files, then remove duplicates."""
    logging.info("Building track list\n")
    files_by_track = get_track_file_mapping(CURRENT_DIR)

    logging.info("Deduplicating tracks\n")
    dedup(files_by_track)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment