Skip to content

Instantly share code, notes, and snippets.

@markwatson
Last active June 10, 2026 03:24
Show Gist options
  • Select an option

  • Save markwatson/edb2175b11044db0a1efe7bf5e855353 to your computer and use it in GitHub Desktop.

Select an option

Save markwatson/edb2175b11044db0a1efe7bf5e855353 to your computer and use it in GitHub Desktop.
Fetch missing album cover art for a music library
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""grab_album_covers — fetch missing album cover art for a music library.
BACKGROUND
----------
Navidrome resolves album art in this order (CoverArtPriority, default):
1. a cover.* / folder.* / front.* image file in the album directory
2. an image embedded in ANY of the album's media files
3. external services (disabled by default)
So an album only shows blank in Navidrome when it has neither a cover
file nor embedded art in any track.
If you rip CDs with whipper, its --cover-art flag fetches from the Cover
Art Archive (CAA). When a MusicBrainz release has no CAA image — common
for live/bootleg/classical/obscure releases — whipper silently saves
nothing, which is how the gaps appeared in this library in the first place.
WHAT THIS DOES
--------------
For every directory under LIBRARY that contains audio files but no cover
image (file or embedded), try these sources in order and save the first
hit as cover.jpg:
0. a sibling "(Disc N of M)" directory that already has a cover
(multi-disc sets share one sleeve; no network needed)
1. Cover Art Archive by MUSICBRAINZ_ALBUMID from the file tags
2. Cover Art Archive by MUSICBRAINZ_RELEASEGROUPID (covers the common
case where *some* edition of the album has art, just not ours)
3. Deezer album search by artist + album tags (1000x1000 images)
4. iTunes album search by artist + album tags (600x600 images)
5. Discogs, via the release's Discogs URL relationship in MusicBrainz
(this is how we found the Dylan/Young "Live On Air 1988" bootleg)
WHERE IT CAN GO WRONG
---------------------
* Sources 3-5 are FUZZY. Deezer/iTunes results are matched by normalized
title comparison, which can still pick a remaster/deluxe/wrong edition
with different art — and conversely REJECTS legitimate hits whose titles
differ (e.g. Deezer lists "Chopin: Piano Concertos 1 & 2" where our tag
says "Piano Concertos 1 & 2", which pushed that album down to Discogs).
Anything fetched from a fuzzy source is logged with "VERIFY:" — eyeball
those covers afterwards.
* Files with no MusicBrainz tags (anything not tagged by whipper/Picard)
skip straight to the fuzzy searches, with artist/album read from plain
tags. Untagged files can't be searched at all.
* Discogs: unauthenticated API is rate-limited (~25 req/min) and image
URLs occasionally require auth; failures here are non-fatal.
* The embedded-art check matches Navidrome ("any track has art"), so an
album where only track 1 has embedded art is considered fine — that is
what Navidrome will display.
* Only looks at directories whose audio sits directly in them; exotic
layouts (loose files in artist root, Artist/Album/CD1/) need a manual
pass.
* After running, Navidrome picks changes up on its next scan; if a cover
still looks blank, restart Navidrome — it caches resized images and the
cache can go stale (github.com/navidrome/navidrome/issues/2692).
USAGE
grab_album_covers [-n] [LIBRARY_DIR]
-n / --dry-run report what would be fetched, download nothing
--test run the built-in unit tests instead (see TESTS below)
LIBRARY_DIR defaults to ~/Music/shared/organized (this library's layout —
point it anywhere).
Exit code is 0 if every missing cover was fetched, 1 if any album still
needs art (so you can chain it in scripts).
TESTS
grab_album_covers --test # whole suite
grab_album_covers --test TestNorm -v # extra args go to unittest
Tests live at the bottom of this file and are offline-safe: every network
call is mocked. (--test instead of `python -m unittest <file>` because
this script has no .py extension, so unittest can't import it by path.)
Requires: python3 3.10+ (stdlib only) and ffprobe, which ships with ffmpeg:
dnf install ffmpeg / apt install ffmpeg / brew install ffmpeg
License: MIT
"""
import argparse
import http.client
import json
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unicodedata
import unittest
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from unittest import mock
# tempfile/unittest/mock are only used by the test suite at the bottom of
# this file; they're imported here anyway because linters (rightly) dislike
# mid-file imports, and they're all stdlib.
# Lowercase file suffixes (with the dot) that count as audio when scanning.
AUDIO_EXTS = {".flac", ".mp3", ".m4a", ".ogg", ".opus", ".wav"}
# File names that count as an existing cover image (matched case-insensitively).
COVER_RE = re.compile(r"^(cover|folder|front|albumart)\.(jpe?g|png|webp)$", re.I)
# Matches the " (Disc N of M)" marker in a directory name plus anything after
# it, so substituting it away leaves the shared album name.
DISC_RE = re.compile(r" \(Disc \d+ of \d+\).*$")
# Sent as the User-Agent header on every HTTP request.
USER_AGENT = "grab_album_covers/2.0 (personal music library tool)"
# Timeout passed to urlopen() for each request, in seconds.
TIMEOUT = 60
def audio_files(d: Path) -> list[Path]:
    """Return the audio files sitting directly in *d*, sorted by path.

    A file counts as audio when its lowercased suffix is in AUDIO_EXTS;
    subdirectories are not descended into.
    """
    matches = [
        entry
        for entry in d.iterdir()
        if entry.is_file() and entry.suffix.lower() in AUDIO_EXTS
    ]
    matches.sort()
    return matches
def ffprobe_json(path: Path, *args: str) -> dict:
    """Run ffprobe on *path* and return its parsed JSON output.

    *args* are extra ffprobe options, placed before the path on the command
    line. Returns {} on any failure: ffprobe missing or not runnable, the
    60-second timeout expiring, a non-zero exit status, or output that is
    not a JSON object.
    """
    cmd = ["ffprobe", "-v", "error", "-of", "json", *args, str(path)]
    try:
        # Decode as UTF-8 explicitly: text=True alone uses the locale
        # encoding, which can raise UnicodeDecodeError (or produce mojibake)
        # on non-ASCII tags. errors="replace" keeps one odd byte from
        # aborting the whole scan.
        proc = subprocess.run(
            cmd, capture_output=True, text=True,
            encoding="utf-8", errors="replace", timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # ValueError: e.g. an embedded NUL in the path.
        return {}
    if proc.returncode != 0:
        return {}
    try:
        parsed = json.loads(proc.stdout)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    # Callers chain .get() on the result, so never hand back a non-dict.
    return parsed if isinstance(parsed, dict) else {}
def tags(path: Path) -> dict[str, str]:
    """Return the file's tags as a dict with uppercased keys ({} on failure).

    Container-level ("format") tags take precedence; stream-level tags only
    fill in keys the container does not provide. Reading stream tags matters
    for Ogg Vorbis/Opus, where ffprobe reports the comments on the audio
    stream rather than on the container.

    whipper writes Vorbis comments on FLAC, which arrive already named
    MUSICBRAINZ_ALBUMID etc. NOTE(review): Picard stores the same IDs in
    ID3/MP4 under descriptions such as "MusicBrainz Album Id", which ffprobe
    is expected to report with the spaces intact — confirm against a
    Picard-tagged MP3. Those spellings are aliased to the Vorbis-style names
    so callers only need to look up one key.
    """
    probe = ffprobe_json(path, "-show_entries", "stream_tags:format_tags")
    merged: dict[str, str] = {}
    for stream in probe.get("streams", []):
        for key, value in stream.get("tags", {}).items():
            merged.setdefault(key.upper(), value)
    for key, value in probe.get("format", {}).get("tags", {}).items():
        merged[key.upper()] = value
    aliases = (
        ("MUSICBRAINZ ALBUM ID", "MUSICBRAINZ_ALBUMID"),
        ("MUSICBRAINZ RELEASE GROUP ID", "MUSICBRAINZ_RELEASEGROUPID"),
    )
    for spaced, canonical in aliases:
        if spaced in merged:
            # Never overwrite a key the file already carries.
            merged.setdefault(canonical, merged[spaced])
    return merged
def has_embedded_art(d: Path) -> bool:
    """Report whether at least one track in *d* carries an attached picture.

    One track with art is enough — this mirrors Navidrome's fallback ('an
    embedded image from one of the mediafiles'). Tracks are probed in
    sorted order and probing stops at the first hit.
    """
    probe_args = (
        "-select_streams", "v",
        "-show_entries", "stream_disposition=attached_pic",
    )
    for track in audio_files(d):
        video_streams = ffprobe_json(track, *probe_args).get("streams", [])
        for stream in video_streams:
            if stream.get("disposition", {}).get("attached_pic"):
                return True
    return False
def has_cover_file(d: Path) -> bool:
    """Report whether *d* directly contains a file whose name matches COVER_RE."""
    for entry in d.iterdir():
        if entry.is_file() and COVER_RE.match(entry.name):
            return True
    return False
def http_get(url: str) -> bytes | None:
    """GET *url* with our User-Agent and return the response body.

    MusicBrainz requires a User-Agent and Discogs throttles blank ones.
    Returns None — never raises — on any HTTP/network error or when *url*
    is malformed, so callers can treat every failure as a plain miss.
    """
    try:
        # Request() itself raises ValueError for a URL with no usable
        # scheme, so it has to be inside the try block too.
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
            return resp.read()
    except (OSError, http.client.HTTPException, ValueError):
        # OSError covers URLError/HTTPError and TimeoutError. HTTPException
        # (IncompleteRead, InvalidURL, BadStatusLine, ...) is NOT an OSError
        # and would otherwise abort the whole run on one truncated response.
        return None
def get_json(url: str) -> dict:
    """Fetch *url* and return the decoded JSON object.

    Returns {} when the request fails, the body is not valid JSON (or not
    decodable text), or the top-level JSON value is not an object — callers
    chain .get() on the result, so a list or null must not leak through.
    """
    body = http_get(url)
    if body is None:
        return {}
    try:
        parsed = json.loads(body)
    except ValueError:
        # Covers json.JSONDecodeError and the UnicodeDecodeError raised
        # while decoding the bytes; both are ValueError subclasses.
        return {}
    return parsed if isinstance(parsed, dict) else {}
def looks_like_image(data: bytes) -> bool:
    """Sniff the leading magic bytes for JPEG, PNG or WebP.

    Hand-rolled because imghdr left the stdlib in 3.13. Exists so that 404
    pages and JSON error bodies never get written out as cover.jpg.
    """
    jpeg_magic = b"\xff\xd8\xff"
    png_magic = b"\x89PNG\r\n\x1a\n"
    if data.startswith((jpeg_magic, png_magic)):
        return True
    # WebP: a RIFF container whose form type (bytes 8-11) is "WEBP".
    return data[:4] == b"RIFF" and data[8:12] == b"WEBP"
def save_cover(url: str, d: Path) -> bool:
    """Download *url* and write it to d/cover.jpg if it is a real image.

    Returns True when the file was written; False when the download failed,
    came back empty, or did not pass the magic-byte check (in which case
    nothing is written).
    """
    payload = http_get(url)
    if payload and looks_like_image(payload):
        (d / "cover.jpg").write_bytes(payload)
        return True
    return False
def norm(title: str) -> str:
    """Normalize a title for fuzzy comparison.

    Case-folds, drops '(Deluxe Edition)' / '[2015 Remaster]' style bracketed
    qualifiers, and keeps alphanumerics only. Accents are folded ("Beyoncé"
    matches "Beyonce") and non-Latin letters are kept, so titles in other
    scripts no longer all collapse to the same empty string. If removing the
    bracketed parts leaves nothing (the whole title is bracketed, e.g.
    "(Untitled)"), the bracket contents are kept instead. Pure-ASCII titles
    normalize exactly as before, apart from that fallback.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # marks are not alphanumeric, so the filter below discards them.
    folded = unicodedata.normalize("NFKD", title).casefold()
    without_qualifiers = re.sub(r"[\(\[].*?[\)\]]", "", folded)
    key = "".join(ch for ch in without_qualifiers if ch.isalnum())
    if not key:
        key = "".join(ch for ch in folded if ch.isalnum())
    return key
# --- per-source fetchers (return a log line on success, None on miss) -------
def _find_cover_file(d: Path) -> Path | None:
    """Return a cover image file in *d* (cover.jpg preferred), or None."""
    preferred = d / "cover.jpg"
    if preferred.is_file():
        return preferred
    for entry in sorted(d.iterdir()):
        if entry.is_file() and COVER_RE.match(entry.name):
            return entry
    return None


def try_sibling_disc(d: Path) -> str | None:
    """Copy the cover from another disc of the same multi-disc set.

    whipper writes one dir per disc but the art usually only lands in
    whichever disc matched a CAA image. Any file name that has_cover_file()
    would accept is taken from the sibling (cover.jpg first), and it is
    copied into *d* as cover.<ext>, keeping the source's extension.
    Siblings are visited in sorted order so the choice is deterministic.

    Returns a log line on success; None when *d* is not a "(Disc N of M)"
    directory or no sibling disc has a cover file.
    """
    base = DISC_RE.sub("", d.name)
    if base == d.name:
        return None  # not a multi-disc dir
    for sib in sorted(d.parent.iterdir()):
        if sib == d or not sib.is_dir():
            continue
        # Same base + its own "(Disc N of M)" marker — a plain startswith()
        # here would wrongly match e.g. "Album II (Disc 1 of 2)".
        if not DISC_RE.search(sib.name) or DISC_RE.sub("", sib.name) != base:
            continue
        source = _find_cover_file(sib)
        if source is None:
            continue
        shutil.copyfile(source, d / f"cover{source.suffix.lower()}")
        return f"OK (sibling disc): {sib}"
    return None
def try_caa(d: Path, t: dict) -> str | None:
    """Fetch the front cover from the Cover Art Archive.

    Looks up the exact release (MUSICBRAINZ_ALBUMID) first, then the release
    group (MUSICBRAINZ_RELEASEGROUPID); the /front endpoint redirects to the
    image flagged as the front cover. An ID missing from the tags is skipped
    without a request. Returns a log line on success, None on a miss.
    """
    base_url = "https://coverartarchive.org"
    lookups = (
        ("release", t.get("MUSICBRAINZ_ALBUMID")),
        ("release-group", t.get("MUSICBRAINZ_RELEASEGROUPID")),
    )
    for entity, ident in lookups:
        if not ident:
            continue
        if save_cover(f"{base_url}/{entity}/{ident}/front", d):
            return f"OK (CAA {entity}): {ident}"
    return None
def try_deezer(d: Path, artist: str, album: str) -> str | None:
    """Search Deezer for the album and save its 1000x1000 cover (no auth).

    A hit is accepted only when its normalized title equals the normalized
    *album*. Returns a "VERIFY" log line on success, None on a miss. No
    request is made when *album* normalizes to nothing: an empty key would
    "match" any hit whose title also normalizes to empty and fetch
    unrelated art.
    """
    wanted = norm(album)  # loop-invariant, so computed once
    if not wanted:
        return None
    q = urllib.parse.urlencode({"q": f"{artist} {album}"})
    hits = get_json(f"https://api.deezer.com/search/album?{q}").get("data") or []
    for hit in hits:
        if not isinstance(hit, dict):
            continue  # tolerate an unexpected response shape
        cover = hit.get("cover_xl")
        title = hit.get("title") or ""  # JSON null -> ""
        if cover and norm(str(title)) == wanted and save_cover(cover, d):
            return f"VERIFY (Deezer, fuzzy match): {cover}"
    return None
def try_itunes(d: Path, artist: str, album: str) -> str | None:
    """Search iTunes for the album and save a 600x600 cover.

    artworkUrl100 is a thumbnail, but the CDN serves larger sizes if the
    dimension suffix is rewritten. A hit is accepted only when its
    normalized collectionName equals the normalized *album*. Returns a
    "VERIFY" log line on success, None on a miss. No request is made when
    *album* normalizes to nothing, since an empty key would match any hit
    that also normalizes to empty.
    """
    wanted = norm(album)  # loop-invariant, so computed once
    if not wanted:
        return None
    q = urllib.parse.urlencode(
        {"term": f"{artist} {album}", "entity": "album", "limit": 10}
    )
    hits = get_json(f"https://itunes.apple.com/search?{q}").get("results") or []
    for hit in hits:
        if not isinstance(hit, dict):
            continue  # tolerate an unexpected response shape
        thumb = hit.get("artworkUrl100")
        name = hit.get("collectionName") or ""  # JSON null -> ""
        if not thumb or norm(str(name)) != wanted:
            continue
        url = thumb.replace("100x100bb", "600x600bb")
        if save_cover(url, d):
            return f"VERIFY (iTunes, fuzzy match): {url}"
    return None
def try_discogs_via_mb(d: Path, t: dict) -> str | None:
    """Follow a release's Discogs link in MusicBrainz and save its image.

    Needs MUSICBRAINZ_ALBUMID in the tags; without it nothing is requested.
    Each URL relationship pointing at a discogs.com release is tried in
    turn, preferring an image typed "primary" and otherwise taking the
    first image listed. Returns a "VERIFY" log line on success, None on a
    miss.
    """
    release_mbid = t.get("MUSICBRAINZ_ALBUMID")
    if not release_mbid:
        return None
    mb_release = get_json(
        f"https://musicbrainz.org/ws/2/release/{release_mbid}?inc=url-rels&fmt=json"
    )
    for relation in mb_release.get("relations", []):
        resource = relation.get("url", {}).get("resource", "")
        found = re.search(r"discogs\.com/release/(\d+)", resource)
        if found is None:
            continue
        # Politeness delay: MB asks <=1 req/sec, Discogs ~25/min unauth.
        time.sleep(2)
        discogs = get_json(f"https://api.discogs.com/releases/{found.group(1)}")
        images = discogs.get("images", [])
        candidates = [img for img in images if img.get("type") == "primary"]
        if not candidates:
            candidates = images
        if not candidates:
            continue
        uri = candidates[0].get("uri")
        if uri and save_cover(uri, d):
            return f"VERIFY (Discogs via MusicBrainz): {uri}"
    return None
# --- main --------------------------------------------------------------------
def album_dirs(library: Path):
    """Yield each directory under *library* that directly holds audio files.

    Every path below *library* is walked in sorted order and each
    qualifying parent directory is yielded once, at its first audio file.
    """
    already_yielded = set()
    for entry in sorted(library.rglob("*")):
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in AUDIO_EXTS:
            continue
        album = entry.parent
        if album in already_yielded:
            continue
        already_yielded.add(album)
        yield album
def main() -> int:
    """Scan the library, fetch missing covers, and print a summary.

    Returns the exit status: 1 if at least one album still had no art after
    every source was tried, otherwise 0. A dry run only lists the albums
    missing art and returns 0. Exits early through argparse (status 2) when
    LIBRARY is not an existing directory, and through sys.exit with a
    message when ffprobe is not on PATH.
    """
    ap = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    ap.add_argument("-n", "--dry-run", action="store_true",
                    help="report what would be fetched, download nothing")
    ap.add_argument("library", nargs="?", type=Path,
                    default=Path.home() / "Music/shared/organized")
    opts = ap.parse_args()
    # Without this check a mistyped path scans nothing, reports
    # "Albums missing art: 0" and exits 0 — indistinguishable from success.
    if not opts.library.is_dir():
        ap.error(f"library directory not found: {opts.library}")
    if shutil.which("ffprobe") is None:
        sys.exit("ffprobe not found — it ships with ffmpeg "
                 "(dnf/apt/brew install ffmpeg)")
    found = fetched = failed = 0
    for d in album_dirs(opts.library):
        if has_cover_file(d) or has_embedded_art(d):
            continue
        found += 1
        print(f"MISSING ART: {d}")
        if opts.dry_run:
            continue
        # The first track's tags stand in for the whole album.
        t = tags(audio_files(d)[0])
        artist = t.get("ALBUM_ARTIST") or t.get("ALBUMARTIST") or t.get("ARTIST", "")
        album = t.get("ALBUM", "")
        # Sources are tried cheapest/most authoritative first; the `or`
        # chain stops at the first one that returns a log line. The fuzzy
        # searches are skipped when there is no album title to search for.
        result = (
            try_sibling_disc(d)
            or try_caa(d, t)
            or (album and try_deezer(d, artist, album))
            or (album and try_itunes(d, artist, album))
            or try_discogs_via_mb(d, t)
        )
        if result:
            fetched += 1
            print(f"  {result}")
        else:
            failed += 1
            print("  FAILED: no source had art — find one manually"
                  " and save it as cover.jpg")
        time.sleep(1)  # spread requests out across albums
    print(f"\nAlbums missing art: {found}, fetched: {fetched}, failed: {failed}")
    if fetched:
        print("Review any VERIFY lines above, then rescan Navidrome.")
        print("(If a cover still shows blank after rescanning, restart Navidrome"
              " to clear its image cache.)")
    return 1 if failed else 0
# ---------------------------------------------------------------------------
# TESTS — run with: grab_album_covers --test
# ---------------------------------------------------------------------------
# Offline-safe: http_get/get_json are mocked, so nothing here touches the
# network. ffprobe is only exercised on its failure path (missing file).
# Mocks patch this module's globals via MOD, which works whether the file is
# executed as __main__ or imported under another name.
# This module object itself; the tests hand it to mock.patch.object() to
# replace http_get/get_json (and time.sleep) for the duration of a test.
MOD = sys.modules[__name__]
# Minimal byte strings that satisfy looks_like_image(); only the magic
# numbers matter, the rest is padding.
JPEG = b"\xff\xd8\xff\xe0" + b"\x00" * 16
PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 16
# RIFF header, four size bytes (zeroed here), then the "WEBP" form type.
WEBP = b"RIFF\x00\x00\x00\x00WEBP" + b"\x00" * 16
class TestNorm(unittest.TestCase):
    """Title normalization used by the fuzzy (Deezer/iTunes) matchers."""

    def test_case_and_punctuation_ignored(self):
        plain = norm("Kid A")
        shouted = norm("KID-A!")
        self.assertEqual(plain, shouted)

    def test_edition_qualifiers_stripped(self):
        pairs = (
            ("Rumours Live (Deluxe Edition)", "Rumours Live"),
            ("Vulnicura [2015 Remaster]", "Vulnicura"),
        )
        for qualified, bare in pairs:
            self.assertEqual(norm(qualified), norm(bare))

    def test_known_limitation_prefixed_titles_do_not_match(self):
        # The Chopin case seen against the live service: Deezer prefixes
        # the composer ("Chopin: Piano Concertos 1 & 2") while our tag is
        # just "Piano Concertos 1 & 2". The mismatch is intentional —
        # strict matching beats grabbing the wrong edition's art — so the
        # album falls through to the next source (Discogs found it).
        deezer_title = "Chopin: Piano Concertos 1 & 2"
        tagged_title = "Piano Concertos 1 & 2"
        self.assertNotEqual(norm(deezer_title), norm(tagged_title))
class TestLooksLikeImage(unittest.TestCase):
    """Magic-byte sniffing that guards what gets written as cover.jpg."""

    def test_accepts_jpeg_png_webp(self):
        samples = [JPEG, PNG, WEBP]
        for sample in samples:
            self.assertTrue(looks_like_image(sample))

    def test_rejects_error_bodies(self):
        # CAA answers 404 with JSON and some CDNs with an HTML page — the
        # exact garbage this check exists to keep out of cover.jpg.
        rejected = [
            b'{"error": "Not Found"}',
            b"<html>404</html>",
            b"",
            b"RIFF1234NOPE",
        ]
        for body in rejected:
            self.assertFalse(looks_like_image(body))
class TestCoverFileDetection(unittest.TestCase):
    """has_cover_file() must match exactly the names COVER_RE allows."""

    def test_recognized_and_ignored_names(self):
        not_covers = ("back.jpg", "cover.gif", "mycover.jpg", "cover.jpg.bak")
        with tempfile.TemporaryDirectory() as tmp:
            album = Path(tmp)
            for filename in not_covers:
                (album / filename).write_bytes(b"x")
            self.assertFalse(has_cover_file(album))
            # Any letter case, and any of the four accepted stems, counts.
            (album / "Folder.PNG").write_bytes(b"x")
            self.assertTrue(has_cover_file(album))
class TestAlbumDirs(unittest.TestCase):
    """album_dirs() yields only directories that directly hold audio."""

    def test_finds_dirs_directly_containing_audio(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            album = root / "Artist/Album"
            nested_disc = root / "Deep/Nested/Disc1"
            album.mkdir(parents=True)
            (album / "01. Song.flac").write_bytes(b"x")
            nested_disc.mkdir(parents=True)
            (nested_disc / "track.mp3").write_bytes(b"x")
            # A non-audio file must not make its directory an album.
            (root / "Artist/notes.txt").write_bytes(b"x")
            expected = {album, nested_disc}
            self.assertEqual(set(album_dirs(root)), expected)
class TestSiblingDisc(unittest.TestCase):
    """Borrowing cover art from another disc of the same set."""

    def test_copies_cover_from_sibling(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            needy = root / "Rumours Live (Disc 1 of 2)"
            donor = root / "Rumours Live (Disc 2 of 2)"
            for folder in (needy, donor):
                folder.mkdir()
            (donor / "cover.jpg").write_bytes(JPEG)
            outcome = try_sibling_disc(needy)
            self.assertIn("sibling", outcome)
            self.assertEqual((needy / "cover.jpg").read_bytes(), JPEG)

    def test_handles_subtitled_disc_names(self):
        # Seen in the real library: whipper appends a per-disc subtitle
        # after the "(Disc N of M)" marker.
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            needy = root / "Concentration (Disc 1 of 2): Air"
            donor = root / "Concentration (Disc 2 of 2): Liquid"
            for folder in (needy, donor):
                folder.mkdir()
            (donor / "cover.jpg").write_bytes(JPEG)
            self.assertIsNotNone(try_sibling_disc(needy))

    def test_ignores_different_album_with_shared_prefix(self):
        # "Album II (Disc 1 of 2)" starts with "Album" yet is a different
        # release; its art must never land on "Album (Disc 1 of 2)".
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            needy = root / "Album (Disc 1 of 2)"
            unrelated = root / "Album II (Disc 1 of 2)"
            for folder in (needy, unrelated):
                folder.mkdir()
            (unrelated / "cover.jpg").write_bytes(JPEG)
            self.assertIsNone(try_sibling_disc(needy))

    def test_non_disc_dir_and_coverless_sibling_miss(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            single = root / "Just An Album"
            single.mkdir()
            self.assertIsNone(try_sibling_disc(single))
            first = root / "Set (Disc 1 of 2)"
            second = root / "Set (Disc 2 of 2)"
            first.mkdir()
            second.mkdir()  # the sibling exists but holds no cover
            self.assertIsNone(try_sibling_disc(first))
class TestSaveCover(unittest.TestCase):
    """save_cover() writes cover.jpg only for real image bytes."""

    def test_writes_valid_image(self):
        with tempfile.TemporaryDirectory() as tmp:
            album = Path(tmp)
            with mock.patch.object(MOD, "http_get", return_value=JPEG):
                self.assertTrue(save_cover("https://example/front", album))
                self.assertEqual((album / "cover.jpg").read_bytes(), JPEG)

    def test_rejects_non_image_and_network_failure(self):
        # A JSON error body and a failed request (None) must both be
        # refused and leave no file behind.
        responses = (b'{"error":"Not Found"}', None)
        with tempfile.TemporaryDirectory() as tmp:
            album = Path(tmp)
            for response in responses:
                with mock.patch.object(MOD, "http_get", return_value=response):
                    saved = save_cover("https://example/front", album)
                    self.assertFalse(saved)
                self.assertFalse((album / "cover.jpg").exists())
class TestTryCaa(unittest.TestCase):
    """Cover Art Archive lookups by release and release-group ID."""

    def test_falls_back_from_release_to_release_group(self):
        # The behavior seen live: the per-release lookup 404s (None) while
        # the release group has art. Both IDs must be tried, in that order.
        mb_tags = {
            "MUSICBRAINZ_ALBUMID": "rel-id",
            "MUSICBRAINZ_RELEASEGROUPID": "rg-id",
        }
        expected_urls = [
            "https://coverartarchive.org/release/rel-id/front",
            "https://coverartarchive.org/release-group/rg-id/front",
        ]
        with tempfile.TemporaryDirectory() as tmp:
            with mock.patch.object(
                MOD, "http_get", side_effect=[None, JPEG]
            ) as fake_get:
                outcome = try_caa(Path(tmp), mb_tags)
                self.assertIn("release-group", outcome)
                requested = [call.args[0] for call in fake_get.call_args_list]
                self.assertEqual(requested, expected_urls)

    def test_no_mb_tags_means_no_requests(self):
        with tempfile.TemporaryDirectory() as tmp:
            with mock.patch.object(MOD, "http_get") as fake_get:
                self.assertIsNone(try_caa(Path(tmp), {}))
                fake_get.assert_not_called()
class TestTryDeezer(unittest.TestCase):
    """Deezer search: only a title that normalizes equal is accepted."""

    def test_skips_wrong_title_picks_exact_match(self):
        search_response = {"data": [
            {"title": "Kid A Mnesia", "cover_xl": "https://img/wrong"},
            {"title": "Kid A", "cover_xl": "https://img/right"},
        ]}
        with tempfile.TemporaryDirectory() as tmp:
            with mock.patch.object(MOD, "get_json", return_value=search_response), \
                    mock.patch.object(MOD, "http_get", return_value=JPEG) as fake_get:
                outcome = try_deezer(Path(tmp), "Radiohead", "Kid A")
                # Fuzzy sources must always ask for a human look.
                self.assertIn("VERIFY", outcome)
                fake_get.assert_called_once_with("https://img/right")

    def test_no_match_returns_none(self):
        search_response = {"data": [
            {"title": "Completely Different", "cover_xl": "https://img/x"},
        ]}
        with tempfile.TemporaryDirectory() as tmp:
            with mock.patch.object(MOD, "get_json", return_value=search_response):
                outcome = try_deezer(Path(tmp), "Radiohead", "Kid A")
                self.assertIsNone(outcome)
class TestTryItunes(unittest.TestCase):
    """iTunes search: the thumbnail URL is rewritten to the 600px size."""

    def test_upscales_thumbnail_url(self):
        search_response = {"results": [{
            "collectionName": "Weed Garden",
            "artworkUrl100": "https://img/100x100bb.jpg",
        }]}
        with tempfile.TemporaryDirectory() as tmp:
            with mock.patch.object(MOD, "get_json", return_value=search_response), \
                    mock.patch.object(MOD, "http_get", return_value=JPEG) as fake_get:
                outcome = try_itunes(Path(tmp), "Iron & Wine", "Weed Garden")
                self.assertIn("VERIFY", outcome)
                fake_get.assert_called_once_with("https://img/600x600bb.jpg")
class TestTryDiscogsViaMb(unittest.TestCase):
    """Discogs lookup reached through MusicBrainz URL relationships."""

    def test_follows_mb_relation_prefers_primary_image(self):
        mb_response = {"relations": [
            {"url": {"resource": "https://example.com/not-discogs"}},
            {"url": {"resource": "https://www.discogs.com/release/33891879"}},
        ]}
        discogs_response = {"images": [
            {"type": "secondary", "uri": "https://i.discogs/back"},
            {"type": "primary", "uri": "https://i.discogs/front"},
        ]}
        responses = [mb_response, discogs_response]
        with tempfile.TemporaryDirectory() as tmp:
            # time.sleep is patched out so the politeness delay is skipped.
            with mock.patch.object(MOD, "get_json", side_effect=responses), \
                    mock.patch.object(MOD, "http_get", return_value=JPEG) as fake_get, \
                    mock.patch.object(MOD.time, "sleep"):
                outcome = try_discogs_via_mb(Path(tmp), {"MUSICBRAINZ_ALBUMID": "x"})
                self.assertIn("Discogs", outcome)
                fake_get.assert_called_once_with("https://i.discogs/front")
class TestFfprobeFailurePath(unittest.TestCase):
    """The one ffprobe test that needs the real binary: its failure path."""

    @unittest.skipUnless(shutil.which("ffprobe"), "ffprobe not installed")
    def test_missing_file_yields_empty_dict(self):
        missing = Path("/nonexistent/no.flac")
        probed = ffprobe_json(missing)
        self.assertEqual(probed, {})
if __name__ == "__main__":
    # Normal run unless --test was given; sys.exit() never returns, so the
    # lines after it only execute in test mode.
    if "--test" not in sys.argv:
        sys.exit(main())
    # Strip our own flag; whatever remains on the command line (test names,
    # -v, ...) is left for unittest to parse.
    sys.argv.remove("--test")
    unittest.main(verbosity=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment