Created
April 18, 2025 13:37
-
-
Save AlphaSheep/008b4378b8038e22f5a0da8bd0ab1bc2 to your computer and use it in GitHub Desktop.
Deduplicate MP3 files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#   "mutagen",
# ]
# ///
""" | |
Deduplicates music, assuming it has been tagged with MusicBrainz Picard.
For each track ID, keeps the copy with the largest file size and deletes
the other copies.
Place this script in the folder that contains the music, then run it.
Assuming you use uv, run it with
    uv run music_dedup.py
""" | |
import logging | |
from pathlib import Path | |
from typing import Final | |
from mutagen.id3 import ID3, ID3NoHeaderError | |
# Emit every log record (DEBUG and above) through the root logger.
logging.basicConfig(level=logging.DEBUG)

# Description of the TXXX frame that holds the track ID used for grouping.
TAG_NAME: Final[str] = "MusicBrainz Release Track Id"

# Upper bound on the number of tagged files collected in a single run.
FILE_COUNT_LIMIT: Final[int] = 10_000_000

# Directory containing this script; it is used as the root of the scan.
CURRENT_DIR: Final[Path] = Path(__file__).resolve().parent
def get_folders(root: Path) -> list[Path]:
    """Return the visible immediate subdirectories of ``root``.

    Entries that are not directories, and entries whose name starts with
    a dot, are left out.
    """
    folders: list[Path] = []
    for entry in root.glob("*"):
        if entry.name.startswith(".") or not entry.is_dir():
            continue
        folders.append(entry)
    return folders
def get_mp3_files(root: Path) -> list[Path]:
    """Return every MP3 file found anywhere beneath ``root``, sorted by path.

    The ``.mp3`` extension is matched case-insensitively so that files such
    as ``Track.MP3`` are found on case-sensitive filesystems too. Only
    regular files are returned: a directory (or dangling symlink) whose name
    happens to end in ``.mp3`` cannot be opened as an audio file and is
    skipped.

    Args:
        root: Directory to search recursively.

    Returns:
        The matching file paths in ascending order.
    """
    return sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.name.lower().endswith(".mp3") and candidate.is_file()
    )
def extract_trackid(path: Path) -> str | None:
    """Return the track ID stored in the ID3 tags of ``path``.

    The ID is the first text value of the ``TXXX`` frame whose description
    equals ``TAG_NAME`` (compared case-insensitively).

    Args:
        path: The MP3 file to inspect.

    Returns:
        The track ID, or ``None`` when the file has no ID3 header, its tags
        cannot be read, or no matching non-empty frame exists. A warning is
        logged in each of those cases.
    """
    # Local import keeps this function self-contained; mutagen is already a
    # dependency of this script.
    from mutagen import MutagenError

    try:
        tags = ID3(path)
    except ID3NoHeaderError:
        logging.warning(f"No ID3 header found in {path}")
        return None
    except MutagenError as exc:
        # A corrupt or unreadable file must not abort the whole run; skipping
        # it is safe because untagged files are never considered for deletion.
        logging.warning(f"Could not read ID3 tags from {path}: {exc}")
        return None

    wanted_desc = TAG_NAME.lower()
    for frame in tags.getall("TXXX"):
        if frame.desc.lower() == wanted_desc and frame.text:
            return frame.text[0]

    logging.warning(f"No tag '{TAG_NAME}' found in {path}")
    return None
def get_track_file_mapping(folder: Path) -> dict[str, list[Path]]:
    """Group the MP3 files under ``folder`` by their track ID.

    Files for which no track ID can be extracted are skipped. Collection
    stops once ``FILE_COUNT_LIMIT`` tagged files have been recorded.

    Returns:
        A mapping from track ID to the files carrying that ID, in the
        order they were encountered.
    """
    files_by_track: dict[str, list[Path]] = {}
    tagged_total = 0

    for mp3_path in get_mp3_files(folder):
        logging.info(f"Processing file {mp3_path}")
        track_id = extract_trackid(mp3_path)
        if track_id is not None:
            files_by_track.setdefault(track_id, []).append(mp3_path)
            tagged_total += 1
            if tagged_total >= FILE_COUNT_LIMIT:
                break

    return files_by_track
def dedup(track_file_map: dict[str, list[Path]]) -> None:
    """Delete all but the largest file for every track ID with duplicates.

    Track IDs that map to a single file are left alone. When several files
    share the largest size, the one listed first is kept.

    Args:
        track_file_map: Mapping from track ID to the files carrying that ID.

    Files that no longer exist when they are inspected are skipped with a
    warning instead of aborting the run.
    """
    for track_id, files in track_file_map.items():
        if len(files) <= 1:
            continue
        logging.info(f"Checking track {track_id}: {files[0]}")

        # Stat each file exactly once. A file may have vanished since the
        # scan; ``unlink(missing_ok=True)`` below already tolerates that, so
        # the size lookup has to tolerate it as well.
        sizes: dict[Path, int] = {}
        for candidate in files:
            try:
                sizes[candidate] = candidate.stat().st_size
            except FileNotFoundError:
                logging.warning(f"Skipping missing file {candidate}")
        if not sizes:
            continue

        # dicts preserve insertion order, so ties resolve to the first file.
        largest = max(sizes, key=sizes.__getitem__)
        logging.info(f"--- KEEP {largest} ({sizes[largest]} bytes)")

        for candidate, size in sizes.items():
            if candidate == largest:
                continue
            logging.info(f"!!! DELETE {candidate} ({size} bytes)")
            candidate.unlink(missing_ok=True)
def main() -> None:
    """Scan the script's directory for tagged MP3s and remove duplicates."""
    logging.info("Building track list\n")
    files_by_track = get_track_file_mapping(CURRENT_DIR)

    logging.info("Deduplicating tracks\n")
    dedup(files_by_track)


# Run the deduplication only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment