AlphaSheep · April 18, 2025 13:37
diff --git a/music_dedup.py b/music_dedup.py
 #!/usr/bin/env -S uv --quiet run --script
 # /// script
 # requires-python = ">=3.13"
 # dependencies = [
 #     "mutagen",
 # ]
 # ///

 """
 Deduplicates music, assuming it has been tagged with MusicBrainz Picard.
 For each track ID, keeps the copy with the largest file size and deletes the rest.
 Place this script in the folder with the music and run it from there.
 Assuming you use uv, run it with
    uv run music_dedup.py
 """

 import logging
 from pathlib import Path
 from typing import Final
 from mutagen.id3 import ID3, ID3NoHeaderError


 # Emit every log record (DEBUG and above) through the root logger.
 logging.basicConfig(level=logging.DEBUG)


 # Directory that contains this script; main() scans it recursively for MP3s.
 CURRENT_DIR: Final[Path] = Path(__file__).resolve().parent

 # get_track_file_mapping() stops once this many tagged files were collected.
 FILE_COUNT_LIMIT: Final[int] = 10_000_000

 # Description of the ID3 TXXX frame that holds the track ID;
 # extract_trackid() compares it case-insensitively.
 TAG_NAME: Final[str] = "MusicBrainz Release Track Id"


 def get_folders(root: Path) -> list[Path]:
    """Return the immediate, non-hidden subdirectories of ``root``.

    Entries whose name starts with a dot and entries that are not
    directories are skipped. The order is whatever ``Path.glob`` yields.
    """
    subdirs: list[Path] = []
    for entry in root.glob("*"):
        if not entry.is_dir():
            continue
        if entry.name.startswith("."):
            continue
        subdirs.append(entry)
    return subdirs


 def get_mp3_files(root: Path) -> list[Path]:
    """Return every ``*.mp3`` file below ``root`` (recursively), sorted.

    Only paths for which ``is_file()`` is true are returned. A directory
    whose name happens to end in ``.mp3`` also matches the glob pattern, but
    must never reach the tag reader or the delete step, so it is filtered out.
    """
    return sorted(p for p in root.rglob("*.mp3") if p.is_file())


 def extract_trackid(path: Path) -> str | None:
    """Read the track ID stored in the ID3 tags of ``path``.

    Looks through the file's TXXX frames for one whose description equals
    ``TAG_NAME`` (ignoring case) and that has at least one text value, and
    returns the first value of the first such frame.

    Returns ``None`` and logs a warning when the file has no ID3 header or
    no matching frame.
    """
    try:
        id3_tags = ID3(path)
    except ID3NoHeaderError:
        logging.warning(f"No ID3 header found in {path}")
        return None

    wanted_desc = TAG_NAME.lower()
    matches = (
        txxx.text[0]
        for txxx in id3_tags.getall("TXXX")
        if txxx.desc.lower() == wanted_desc and txxx.text
    )
    track_id = next(matches, None)

    if track_id is None:
        logging.warning(f"No tag '{TAG_NAME}' found in {path}")
    return track_id


 def get_track_file_mapping(folder: Path) -> dict[str, list[Path]]:
    """Group the MP3 files under ``folder`` by their track ID.

    Files for which ``extract_trackid`` returns ``None`` are ignored and do
    not count towards ``FILE_COUNT_LIMIT``. Scanning stops as soon as that
    many tagged files have been recorded.

    Returns:
        A dict mapping each track ID to the files carrying it, in the order
        ``get_mp3_files`` produced them.
    """
    files_by_track: dict[str, list[Path]] = {}
    tagged = 0
    for mp3 in get_mp3_files(folder):
        logging.info(f"Processing file {mp3}")
        track_id = extract_trackid(mp3)

        if track_id is not None:
            files_by_track.setdefault(track_id, []).append(mp3)

            tagged += 1
            if tagged >= FILE_COUNT_LIMIT:
                break

    return files_by_track


 def dedup(track_file_map: dict[str, list[Path]]) -> None:
    """Delete all but the largest file for every track ID with duplicates.

    Args:
        track_file_map: Mapping of track ID to the files carrying that ID.
            Entries with a single file are left untouched.

    Each file is stat'ed exactly once. A file that has disappeared since the
    map was built is logged and skipped instead of aborting the run, which
    is the tolerance ``unlink(missing_ok=True)`` was already aiming for.
    On a size tie the earliest file in the list is kept.
    """
    for track_id, files in track_file_map.items():
        if len(files) <= 1:
            continue

        logging.info(f"Checking track {track_id}: {files[0]}")

        # Cache sizes so the choice, the log lines and the deletion all agree.
        sizes: dict[Path, int] = {}
        for f in files:
            try:
                sizes[f] = f.stat().st_size
            except FileNotFoundError:
                logging.warning(f"File no longer exists, skipping: {f}")

        if not sizes:
            continue

        # max() returns the first maximal key, so ties keep the earliest file.
        largest = max(sizes, key=sizes.__getitem__)
        logging.info(f"--- KEEP   {largest} ({sizes[largest]} bytes)")

        for f, size in sizes.items():
            if f == largest:
                continue
            logging.info(f"!!! DELETE {f} ({size} bytes)")
            f.unlink(missing_ok=True)


 def main() -> None:
    """Scan the script's directory for tagged MP3s, then remove duplicates."""
    logging.info("Building track list\n")
    tracks_by_id = get_track_file_mapping(CURRENT_DIR)

    logging.info("Deduplicating tracks\n")
    dedup(tracks_by_id)


 # Run only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env -S uv --quiet run --script
	# /// script
	# requires-python = ">=3.13"
	# dependencies = [
	# "mutagen",
	# ]
	# ///

	"""
	Deduplicates music, assuming it has been tagged with MusicBrainz Picard.
	For each track ID, keeps the copy with the largest file size and deletes the rest.
	Place this script in the folder with the music and run it from there.
	Assuming you use uv, run it with
	uv run music_dedup.py
	"""

	import logging
	from pathlib import Path
	from typing import Final
	from mutagen.id3 import ID3, ID3NoHeaderError


	# Emit every log record (DEBUG and above) through the root logger.
	logging.basicConfig(level=logging.DEBUG)


	# Directory that contains this script; main() scans it recursively for MP3s.
	CURRENT_DIR: Final[Path] = Path(__file__).resolve().parent

	# get_track_file_mapping() stops once this many tagged files were collected.
	FILE_COUNT_LIMIT: Final[int] = 10_000_000

	# Description of the ID3 TXXX frame that holds the track ID;
	# extract_trackid() compares it case-insensitively.
	TAG_NAME: Final[str] = "MusicBrainz Release Track Id"


	def get_folders(root: Path) -> list[Path]:
	    """Return the immediate, non-hidden subdirectories of ``root``.

	    Entries that are not directories, or whose name starts with a dot,
	    are skipped. The order is whatever ``Path.glob`` yields.
	    """
	    return [
	        d
	        for d in root.glob("*")
	        if d.is_dir() and not d.name.startswith(".")
	    ]


	def get_mp3_files(root: Path) -> list[Path]:
	    """Return every ``*.mp3`` file below ``root`` (recursively), sorted.

	    Only paths for which ``is_file()`` is true are returned. A directory
	    whose name happens to end in ``.mp3`` also matches the glob pattern,
	    but must never reach the tag reader or the delete step.
	    """
	    return sorted(p for p in root.rglob("*.mp3") if p.is_file())


	def extract_trackid(path: Path) -> str | None:
	    """Read the track ID stored in the ID3 tags of ``path``.

	    Returns the first text value of the first TXXX frame whose description
	    equals ``TAG_NAME`` (ignoring case) and that has at least one value.
	    Returns ``None`` and logs a warning when the file has no ID3 header or
	    no matching frame.
	    """
	    try:
	        tags = ID3(path)
	    except ID3NoHeaderError:
	        logging.warning(f"No ID3 header found in {path}")
	        return None

	    for frame in tags.getall("TXXX"):
	        if frame.desc.lower() == TAG_NAME.lower() and frame.text:
	            return frame.text[0]

	    logging.warning(f"No tag '{TAG_NAME}' found in {path}")
	    return None


	def get_track_file_mapping(folder: Path) -> dict[str, list[Path]]:
	    """Group the MP3 files under ``folder`` by their track ID.

	    Files for which ``extract_trackid`` returns ``None`` are ignored and do
	    not count towards ``FILE_COUNT_LIMIT``. Scanning stops as soon as that
	    many tagged files have been recorded.

	    Returns:
	        A dict mapping each track ID to the files carrying it.
	    """
	    track_file_map: dict[str, list[Path]] = {}
	    count = 0
	    for file in get_mp3_files(folder):
	        logging.info(f"Processing file {file}")
	        track_id = extract_trackid(file)

	        if track_id is None:
	            continue

	        track_file_map.setdefault(track_id, []).append(file)

	        count += 1
	        if count >= FILE_COUNT_LIMIT:
	            break

	    return track_file_map


	def dedup(track_file_map: dict[str, list[Path]]) -> None:
	    """Delete all but the largest file for every track ID with duplicates.

	    Args:
	        track_file_map: Mapping of track ID to the files carrying that ID.
	            Entries with a single file are left untouched.

	    Each file is stat'ed exactly once. A file that has disappeared since
	    the map was built is logged and skipped instead of aborting the run,
	    matching the tolerance ``unlink(missing_ok=True)`` already aims for.
	    On a size tie the earliest file in the list is kept.
	    """
	    for track_id, files in track_file_map.items():
	        if len(files) <= 1:
	            continue

	        logging.info(f"Checking track {track_id}: {files[0]}")

	        # Cache sizes so the choice, the log lines and the deletion agree.
	        sizes: dict[Path, int] = {}
	        for f in files:
	            try:
	                sizes[f] = f.stat().st_size
	            except FileNotFoundError:
	                logging.warning(f"File no longer exists, skipping: {f}")

	        if not sizes:
	            continue

	        # max() returns the first maximal key, so ties keep the earliest file.
	        largest = max(sizes, key=sizes.__getitem__)
	        logging.info(f"--- KEEP {largest} ({sizes[largest]} bytes)")

	        for f, size in sizes.items():
	            if f == largest:
	                continue
	            logging.info(f"!!! DELETE {f} ({size} bytes)")
	            f.unlink(missing_ok=True)


	def main() -> None:
	    """Scan the script's directory for tagged MP3s, then remove duplicates."""
	    logging.info("Building track list\n")
	    track_file_map = get_track_file_mapping(CURRENT_DIR)

	    logging.info("Deduplicating tracks\n")
	    dedup(track_file_map)


	# Run only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()