Skip to content

Instantly share code, notes, and snippets.

@majora2007
Created May 30, 2026 13:31
Show Gist options
  • Select an option

  • Save majora2007/3bf9bd9417b36a7e7cd4cd11cfb5906e to your computer and use it in GitHub Desktop.

Select an option

Save majora2007/3bf9bd9417b36a7e7cd4cd11cfb5906e to your computer and use it in GitHub Desktop.
Convert Image-based Epubs -> CBZs with Metadata
#!/usr/bin/env python3
"""
epub2cbz - Convert an EPUB (2 or 3) to a CBZ.
Strategy:
* Walk the EPUB spine in reading order.
* Open each spine document and extract image references in DOM order.
* Dedupe across the whole book (an image referenced twice is written once).
* Copy image bytes verbatim (no decode / re-encode) into the CBZ.
* Resolve the cover and write it first as 0000_cover.<ext>.
* Map EPUB2 + EPUB3 metadata to a Kavita-flavored ComicInfo.xml.
Dependency: lxml (pip install lxml)
Usage:
    python epub2cbz.py book.epub
    python epub2cbz.py book.epub -o /out/book.cbz --force -v
    python epub2cbz.py /path/to/library --batch --no-recursive
"""
from __future__ import annotations
import argparse
import logging
import posixpath
import re
import sys
import zipfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import unquote
from xml.sax.saxutils import escape
from lxml import etree, html
log = logging.getLogger("epub2cbz")

# --- Namespaces --------------------------------------------------------------
DC = "http://purl.org/dc/elements/1.1/"
OPF = "http://www.idpf.org/2007/opf"
CONTAINER = "urn:oasis:names:tc:opendocument:xmlns:container"
XLINK = "http://www.w3.org/1999/xlink"

# --- Lookups -----------------------------------------------------------------
# marc:relators role codes (and human-readable aliases) -> ComicInfo creator field
ROLE_TO_FIELD: dict[str, str] = {
    "art": "CoverArtist",
    "artist": "CoverArtist",
    "aut": "Writer",
    "author": "Writer",
    "creator": "Writer",
    "cre": "Writer",
    "pbl": "Publisher",
    "publisher": "Publisher",
    "trl": "Translator",
    "translator": "Translator",
    "edt": "Editor",
    "editor": "Editor",
    "ill": "Inker",
    "illustrator": "Inker",
    "clr": "Colorist",
    "colorist": "Colorist",
}

# Manifest media-type -> file extension used inside the CBZ.
MEDIA_EXT: dict[str, str] = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/avif": ".avif",
    "image/svg+xml": ".svg",
    "image/bmp": ".bmp",
    "image/tiff": ".tiff",
}

# Extensions accepted as images when the manifest media-type does not say so.
IMAGE_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".avif",
    ".bmp",
    ".tiff",
}

# ComicInfo element emission order (schema sequence + Kavita extensions placed
# next to related standard fields). Only non-empty fields are written.
FIELD_ORDER: list[str] = [
    # titles / series / numbering
    "Title", "TitleSort", "Series", "SeriesSort", "Number", "Count", "Volume",
    "AlternateSeries", "AlternateNumber", "AlternateCount",
    # free text
    "Summary", "Notes",
    # publication date
    "Year", "Month", "Day",
    # people
    "Writer", "Penciller", "Inker", "Colorist", "Letterer", "CoverArtist",
    "Editor", "Translator", "Publisher", "Imprint",
    # classification
    "Genre", "Tags", "Web", "PageCount", "LanguageISO", "Format",
    "BlackAndWhite", "Manga", "Characters", "Teams", "Locations",
    "MainCharacterOrTeam", "ScanInformation", "StoryArc", "StoryArcNumber",
    "SeriesGroup", "AgeRating", "CommunityRating", "UserRating", "Review",
    # identifiers
    "Isbn", "GTIN",
]
# --- Model -------------------------------------------------------------------
@dataclass
class ManifestItem:
    """A single ``<item>`` of the OPF manifest."""

    id: str
    # href as written in the OPF, i.e. relative to the OPF directory
    href: str
    # resolved zip member name
    full_path: str
    media_type: str
    # whitespace-separated ``properties`` attribute, split into a set
    properties: set[str] = field(default_factory=set)
@dataclass
class Package:
    """Parsed view of an OPF package document."""

    opf_dir: str
    metadata: etree._Element
    manifest_by_id: dict[str, ManifestItem]
    manifest_by_path: dict[str, ManifestItem]
    # ordered list of manifest item ids
    spine: list[str]
    # guide type -> manifest item id (best effort)
    guide: dict[str, str]
# --- Small helpers -----------------------------------------------------------
def localname(tag: object) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix.

    Anything that is not a string yields an empty string.
    """
    if isinstance(tag, str):
        # rpartition leaves the whole string in the last slot when no "}" exists.
        return tag.rpartition("}")[2]
    return ""
def attr_ns(el: etree._Element, name: str) -> str | None:
    """Read attribute ``name``, preferring the OPF-namespaced spelling.

    Falls back to the plain attribute when the namespaced one is missing or
    empty.
    """
    namespaced = el.get(f"{{{OPF}}}{name}")
    if namespaced:
        return namespaced
    return el.get(name)
def clean_text(value: str | None) -> str:
    """Collapse every whitespace run to one space and trim the ends.

    ``None`` and the empty string both give ``""``.
    """
    if not value:
        return ""
    words = value.split()
    return " ".join(words)
def out_ext(member: str, media_type: str | None) -> str:
    """Choose the file extension for an image written into the CBZ.

    The member's own (lower-cased) extension wins when it is a known image
    extension or ``.svg``, with ``.jpeg`` normalised to ``.jpg``. Otherwise
    the manifest media type decides, and ``.img`` is the last resort.
    """
    suffix = posixpath.splitext(member)[1].lower()
    if suffix == ".jpeg":
        return ".jpg"
    if suffix in IMAGE_EXTS or suffix == ".svg":
        return suffix
    mime = (media_type or "").lower()
    return MEDIA_EXT.get(mime, ".img")
def parse_date(value: str | None) -> tuple[int, int, int]:
    """Parse the leading ``YYYY[-MM[-DD]]`` of a date string.

    Returns ``(year, month, day)``; any component that is absent or out of
    range is reported as ``0``. Trailing text (e.g. a time part) is ignored.
    A month outside 1-12 also discards the day, because a day without a
    valid month is meaningless. Unparseable input gives ``(0, 0, 0)``.
    """
    if not value:
        return (0, 0, 0)
    match = re.match(r"(\d{4})(?:-(\d{1,2}))?(?:-(\d{1,2}))?", value.strip())
    if match is None:
        return (0, 0, 0)
    year = int(match.group(1))
    month = int(match.group(2)) if match.group(2) else 0
    day = int(match.group(3)) if match.group(3) else 0
    if not 1 <= month <= 12:
        return (year, 0, 0)
    if not 1 <= day <= 31:
        day = 0
    return (year, month, day)
def _clean_isbn(raw: str) -> str:
raw = raw.lower().replace("urn:isbn:", "").replace("isbn:", "")
return re.sub(r"[^0-9xX]", "", raw)
def is_valid_isbn10(s: str) -> bool:
    """Check an ISBN-10 with the weighted mod-11 checksum.

    Positions are weighted 10 down to 1. ``x``/``X`` counts as 10 and is
    accepted in the final position only.
    """
    if len(s) != 10:
        return False
    checksum = 0
    for weight, ch in zip(range(10, 0, -1), s):
        if ch in ("x", "X"):
            # weight 1 is the last position, the only place for a check "X"
            if weight != 1:
                return False
            digit = 10
        elif ch.isdigit():
            digit = int(ch)
        else:
            return False
        checksum += digit * weight
    return checksum % 11 == 0
def is_valid_isbn13(s: str) -> bool:
    """Check an ISBN-13: 13 digits whose 1,3,1,3,... weighted sum is 0 mod 10."""
    if len(s) == 13 and s.isdigit():
        weight_one = sum(int(c) for c in s[0::2])
        weight_three = sum(int(c) for c in s[1::2])
        return (weight_one + 3 * weight_three) % 10 == 0
    return False
# --- ComicInfo accumulator ---------------------------------------------------
class ComicInfo:
    """Accumulates ComicInfo field values and serialises them to XML."""

    def __init__(self) -> None:
        self._fields: dict[str, str] = {}

    def set(self, field_name: str, value: str | None) -> None:
        """Store the stripped ``value``; blank or ``None`` values are ignored."""
        stripped = (value or "").strip()
        if not stripped:
            return
        self._fields[field_name] = stripped

    def set_if_empty(self, field_name: str, value: str | None) -> None:
        """Like :meth:`set`, but leaves an already populated field alone."""
        if self._fields.get(field_name):
            return
        self.set(field_name, value)

    def append(self, field_name: str, value: str | None, sep: str = ",",
               escape: bool = False) -> None:
        """Add ``value`` to a ``sep``-separated field unless already listed.

        With ``escape`` set (used for person names), backslashes and any
        separator inside the value are backslash-escaped, so a name such as
        "Doe, John" stays one entry instead of being split by a consumer.
        """
        item = clean_text(value)
        if not item:
            return
        if escape:
            item = item.replace("\\", "\\\\").replace(sep, "\\" + sep)
        current = self._fields.get(field_name, "")
        if current:
            # Only unescaped separators delimit entries; escaped ones belong
            # to the entry they appear in.
            unescaped_sep = r"(?<!\\)" + re.escape(sep)
            known = [part.strip() for part in re.split(unescaped_sep, current)]
            if item not in known:
                self._fields[field_name] = current + sep + item
        else:
            self._fields[field_name] = item

    def to_xml(self) -> str:
        """Render the populated fields, in ``FIELD_ORDER``, as a ComicInfo document."""
        header = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<ComicInfo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xmlns:xsd="http://www.w3.org/2001/XMLSchema">',
        ]
        elements = [
            f" <{name}>{escape(self._fields[name])}</{name}>"
            for name in FIELD_ORDER
            if self._fields.get(name)
        ]
        return "\n".join([*header, *elements, "</ComicInfo>"])
# --- EPUB opening / OPF parsing ----------------------------------------------
def locate_opf(zf: zipfile.ZipFile) -> str:
    """Return the OPF path named by ``META-INF/container.xml``.

    Raises:
        ValueError: no ``rootfile`` element with a non-empty ``full-path``.
    """
    container = etree.fromstring(zf.read("META-INF/container.xml"))
    rootfile = container.find(f".//{{{CONTAINER}}}rootfile")
    opf_path = rootfile.get("full-path") if rootfile is not None else None
    if not opf_path:
        raise ValueError("container.xml does not reference an OPF rootfile")
    return opf_path
def resolve_member(candidate: str, names: set[str], names_list: list[str]) -> str | None:
    """Map a computed path onto an actual archive member.

    An exact hit in ``names`` wins. Otherwise the first entry of
    ``names_list`` that shares the candidate's basename is used. Returns
    ``None`` when nothing matches.
    """
    if candidate in names:
        return candidate
    base = posixpath.basename(candidate)
    nested = "/" + base
    return next(
        (n for n in names_list
         if n == candidate or n == base or n.endswith(nested)),
        None,
    )
def parse_package(zf: zipfile.ZipFile, opf_path: str) -> Package:
    """Read the OPF at ``opf_path`` and index its manifest, spine and guide.

    Raises:
        ValueError: the OPF has no metadata, manifest or spine element.
    """
    archive_list = zf.namelist()
    archive_set = set(archive_list)
    opf_dir = posixpath.dirname(opf_path)
    root = etree.fromstring(zf.read(opf_path))

    def section(tag: str):
        return root.find(f"{{{OPF}}}{tag}")

    def candidate_path(href: str) -> str:
        # hrefs are URL-encoded and relative to the OPF's directory
        return posixpath.normpath(posixpath.join(opf_dir, unquote(href)))

    metadata = section("metadata")
    manifest_el = section("manifest")
    spine_el = section("spine")
    guide_el = section("guide")
    if metadata is None or manifest_el is None or spine_el is None:
        raise ValueError("OPF missing metadata/manifest/spine")

    by_id: dict[str, ManifestItem] = {}
    by_path: dict[str, ManifestItem] = {}
    for item in manifest_el:
        if localname(item.tag) != "item":
            continue
        item_id, href = item.get("id"), item.get("href")
        if not (item_id and href):
            continue
        wanted = candidate_path(href)
        # Keep the computed path even when the archive has no such member.
        member = resolve_member(wanted, archive_set, archive_list) or wanted
        entry = ManifestItem(
            id=item_id,
            href=href,
            full_path=member,
            media_type=item.get("media-type", ""),
            properties=set((item.get("properties") or "").split()),
        )
        by_id[item_id] = entry
        by_path[member] = entry

    # linear="no" items are supplementary; we still include them so no
    # referenced image is silently dropped.
    idrefs = (ref.get("idref") for ref in spine_el
              if localname(ref.tag) == "itemref")
    spine = [idref for idref in idrefs if idref and idref in by_id]

    guide: dict[str, str] = {}
    for ref in (guide_el if guide_el is not None else ()):
        if localname(ref.tag) != "reference":
            continue
        gtype = (ref.get("type") or "").lower()
        ghref = ref.get("href")
        if not (gtype and ghref):
            continue
        target = resolve_member(candidate_path(ghref.split("#")[0]),
                                archive_set, archive_list)
        if target and target in by_path:
            guide[gtype] = by_path[target].id

    return Package(opf_dir, metadata, by_id, by_path, spine, guide)
# --- Image reference extraction ----------------------------------------------
def iter_image_refs(doc_bytes: bytes) -> list[str]:
    """Collect image hrefs (``img@src``, SVG ``image@href``) in document order.

    A document that cannot be parsed is logged and yields an empty list, so
    one bad spine document does not abort the whole book.
    """
    try:
        root = html.fromstring(doc_bytes)
    except Exception as exc:  # malformed doc - skip it, don't abort the book
        log.warning("Failed to parse a spine document: %s", exc)
        return []
    found: list[str] = []
    for node in root.iter():
        # localname() already maps non-string tags to ""
        kind = localname(node.tag).lower()
        if kind == "img":
            target = node.get("src")
        elif kind == "image":  # SVG <image>
            target = (node.get("href")
                      or node.get("xlink:href")
                      or node.get(f"{{{XLINK}}}href"))
        else:
            continue
        if target:
            found.append(target)
    return found
def resolve_ref(ref: str, base_dir: str, names: set[str], names_list: list[str]) -> str | None:
    """Turn an href found in a document into an archive member name.

    The fragment is dropped and the rest URL-decoded. Empty references and
    ``http``/``https``/``data``/``mailto`` targets give ``None``. Everything
    else is resolved against ``base_dir`` and looked up in the archive.
    """
    path_part = ref.partition("#")[0]
    target = unquote(path_part).strip()
    if not target:
        return None
    if target.startswith(("http://", "https://", "data:", "mailto:")):
        return None
    joined = posixpath.join(base_dir, target)
    return resolve_member(posixpath.normpath(joined), names, names_list)
def is_image_member(member: str, pkg: Package) -> bool:
    """Tell whether ``member`` is an image.

    True when the manifest declares an ``image/*`` media type for it, or
    when its extension is listed in ``IMAGE_EXTS``.
    """
    item = pkg.manifest_by_path.get(member)
    if item and item.media_type.lower().startswith("image/"):
        return True
    suffix = posixpath.splitext(member)[1].lower()
    return suffix in IMAGE_EXTS
def sequence_images(zf: zipfile.ZipFile, pkg: Package) -> list[str]:
    """List image members in reading order, each at most once.

    Spine items that are images are taken directly when present in the
    archive. Every other spine item is read as a document and scanned for
    image references.
    """
    archive_list = zf.namelist()
    archive_set = set(archive_list)
    pages: list[str] = []
    emitted: set[str] = set()

    def take(member: str) -> None:
        emitted.add(member)
        pages.append(member)

    for idref in pkg.spine:
        item = pkg.manifest_by_id[idref]

        # Image directly in the spine (rare, but valid for fixed-layout).
        if item.media_type.lower().startswith("image/"):
            if item.full_path not in emitted and item.full_path in archive_set:
                take(item.full_path)
            continue

        # Otherwise it's a (X)HTML document - extract its images in DOM order.
        try:
            doc = zf.read(item.full_path)
        except KeyError:
            log.warning("Spine document not found in archive: %s", item.full_path)
            continue
        doc_dir = posixpath.dirname(item.full_path)
        for ref in iter_image_refs(doc):
            member = resolve_ref(ref, doc_dir, archive_set, archive_list)
            if not member or member in emitted:
                continue
            if is_image_member(member, pkg):
                take(member)
    return pages
# --- Cover resolution --------------------------------------------------------
def first_image_in_doc(zf: zipfile.ZipFile, member: str, pkg: Package) -> str | None:
    """Return the first image member referenced by document ``member``.

    ``None`` when the document is not in the archive or references no
    resolvable image.
    """
    try:
        doc = zf.read(member)
    except KeyError:
        return None
    archive_list = zf.namelist()
    archive_set = set(archive_list)
    doc_dir = posixpath.dirname(member)
    # Lazily resolve references so scanning stops at the first usable image.
    resolved = (resolve_ref(ref, doc_dir, archive_set, archive_list)
                for ref in iter_image_refs(doc))
    return next((m for m in resolved if m and is_image_member(m, pkg)), None)
def _cover_from_item(zf: zipfile.ZipFile, pkg: Package, mi: ManifestItem,
                     names: set[str]) -> str | None:
    """Return the image member a cover manifest item stands for, if any.

    An image item is used directly, but only when it exists in the archive.
    Any other item is treated as a wrapper document whose first image is
    used.
    """
    if mi.media_type.lower().startswith("image/"):
        return mi.full_path if mi.full_path in names else None
    return first_image_in_doc(zf, mi.full_path, pkg)


def resolve_cover(zf: zipfile.ZipFile, pkg: Package) -> str | None:
    """Find the cover image member, trying the usual EPUB conventions in turn.

    Order: EPUB3 ``cover-image`` manifest property, EPUB2
    ``<meta name="cover">``, guide ``type="cover"`` reference, then any
    manifest image whose file name contains "cover".

    A candidate that is declared but absent from the archive is skipped and
    the next strategy is tried. Returning it would make the later read of
    the cover fail with ``KeyError`` and abort the whole conversion.

    Returns:
        The zip member name of the cover, or ``None`` when none is found.
    """
    names = set(zf.namelist())

    # 1. EPUB3: manifest item with properties="cover-image"
    for mi in pkg.manifest_by_id.values():
        if "cover-image" in mi.properties and mi.full_path in names:
            return mi.full_path

    # 2. EPUB2: <meta name="cover" content="<manifest-id>">
    for child in pkg.metadata:
        if localname(child.tag) != "meta":
            continue
        if (child.get("name") or "").lower() != "cover":
            continue
        mi = pkg.manifest_by_id.get(child.get("content", ""))
        if mi:
            img = _cover_from_item(zf, pkg, mi, names)
            if img:
                return img

    # 3. Guide reference type="cover" (usually an XHTML wrapper)
    cover_id = pkg.guide.get("cover")
    if cover_id and cover_id in pkg.manifest_by_id:
        img = _cover_from_item(zf, pkg, pkg.manifest_by_id[cover_id], names)
        if img:
            return img

    # 4. Filename fallback: an image whose name contains "cover"
    for mi in pkg.manifest_by_id.values():
        if (mi.media_type.lower().startswith("image/")
                and "cover" in posixpath.basename(mi.full_path).lower()
                and mi.full_path in names):
            return mi.full_path
    return None
# --- Metadata mapping --------------------------------------------------------
def build_comic_info(pkg: Package) -> ComicInfo:
    """Map the package's EPUB2/EPUB3 metadata onto a ``ComicInfo``.

    Works in stages: one sweep over the metadata children collects the raw
    values, then roles, calibre/EPUB3 ``meta`` elements and the remaining
    simple fields are applied. ``PageCount`` is not set here.

    Args:
        pkg: Parsed package; only ``pkg.metadata`` is read.

    Returns:
        A populated ``ComicInfo``.
    """
    md = pkg.metadata
    ci = ComicInfo()
    titles: dict[str, str] = {}  # id -> title text
    first_title: str | None = None
    publishers: list[str] = []
    subjects: list[str] = []
    languages: list[str] = []
    pub_dates: list[tuple[str | None, str]] = []  # (event, date)
    weblinks: list[str] = []
    people: list[dict] = []  # {id, name, role, source}
    metas: list[etree._Element] = []

    for child in md:
        name = localname(child.tag)
        text = clean_text(child.text)
        if name == "title":
            tid = child.get("id")
            if tid:
                titles[tid] = text
            if first_title is None:
                first_title = text
        elif name == "description" and not ci_has(ci, "Summary"):
            ci.set("Summary", text)
        elif name == "publisher":
            publishers.append(text)
        elif name == "subject":
            subjects.append(text.lower())
        elif name == "language":
            languages.append(text)
        elif name == "date":
            pub_dates.append((attr_ns(child, "event"), text))
        elif name == "identifier":
            scheme = (attr_ns(child, "scheme") or "").lower()
            ident = (child.text or "").strip()
            low = ident.lower()
            if scheme == "isbn" or low.startswith(("urn:isbn:", "isbn:")):
                isbn = _clean_isbn(ident)
                if is_valid_isbn10(isbn) or is_valid_isbn13(isbn):
                    ci.set("Isbn", isbn)
            has_url_prefix = ident[:4].lower() == "url:"
            if scheme == "url" or has_url_prefix:
                # Strip only a leading "url:" marker, whatever its case. A
                # plain replace() would miss "URL:" and would also eat any
                # "url:" occurring later inside the link itself.
                link = ident[4:] if has_url_prefix else ident
                weblinks.append(link.strip())
        elif name in ("creator", "contributor"):
            people.append({
                "id": child.get("id"),
                "name": text,
                "role": attr_ns(child, "role"),
                "source": name,
            })
        elif name == "meta":
            metas.append(child)

    ci.set("Title", first_title)

    # Pass 1: EPUB3 role refinements (meta property="role" refines="#id")
    refined_roles: dict[str, str] = {}
    for m in metas:
        if (m.get("property") or "").lower() != "role":
            continue
        scheme = (m.get("scheme") or "").lower()
        if scheme and scheme != "marc:relators":
            continue
        cid = (m.get("refines") or "").lstrip("#")
        if cid:
            refined_roles[cid] = clean_text(m.text)

    # Assign people to creator fields
    for p in people:
        nm = clean_text(p["name"])
        if not nm:
            continue
        # An EPUB3 refinement takes precedence over the opf:role attribute.
        role = refined_roles.get(p["id"] or "") or p["role"] or ""
        target = ROLE_TO_FIELD.get(role.lower())
        if target:
            ci.append(target, nm, escape=True)
        elif p["source"] == "creator":
            ci.append("Writer", nm, escape=True)  # unrefined creators -> Writer

    # Pass 2: calibre (EPUB2) + EPUB3 collection/title metas
    for m in metas:
        nm = (m.get("name") or "").lower()
        content_attr = m.get("content")
        if nm:
            if nm == "calibre:series":
                ci.set("Series", content_attr)
                ci.set_if_empty("SeriesSort", content_attr)
            elif nm == "calibre:series_index":
                ci.set("Volume", content_attr)
            elif nm == "calibre:title_sort":
                ci.set("TitleSort", content_attr)
            elif nm == "calibre:rating":
                ci.set("UserRating", (content_attr or "").strip())
        prop = (m.get("property") or "").lower()
        val = clean_text(m.text)
        if prop == "group-position":
            ci.set("Volume", val)
        elif prop == "belongs-to-collection":
            ci.set("Series", val)
            ci.set_if_empty("SeriesSort", val)
        elif prop == "title-type":
            _handle_title_type(m, metas, titles, ci, val)

    # Publisher / Genre / Language / Date / Web
    if publishers:
        ci.set("Publisher", ",".join(p for p in publishers if p))
    for s in subjects:
        ci.append("Genre", s)
    if languages and languages[0]:
        ci.set("LanguageISO", languages[0])
    if weblinks:
        # dict.fromkeys de-duplicates while keeping first-seen order
        ci.set("Web", ",".join(dict.fromkeys(w for w in weblinks if w)))

    # Prefer the date flagged as the publication event; else the first
    # non-empty date.
    date_str = next((d for ev, d in pub_dates if ev == "publication"), None)
    if not date_str:
        date_str = next((d for _, d in pub_dates if d), None)
    year, month, day = parse_date(date_str)
    if year:
        ci.set("Year", str(year))
    if month:
        ci.set("Month", str(month))
    if day:
        ci.set("Day", str(day))

    # Standalone heuristic: no series and no volume -> single book
    if not ci_has(ci, "Series") and not ci_has(ci, "Volume"):
        ci.set("Count", "1")

    ci.set("Notes",
           f"Converted from EPUB by epub2cbz on "
           f"{datetime.now(timezone.utc).strftime('%Y-%m-%d')}")
    return ci
def _handle_title_type(meta: etree._Element, metas: list[etree._Element],
                       titles: dict[str, str], ci: ComicInfo, content: str) -> None:
    """Apply one EPUB3 ``title-type`` refinement to ``ci``.

    ``main``: the first ``file-as`` meta refining the same title becomes
    ``SeriesSort``. ``collection``: the refined title goes to
    ``SeriesGroup`` when it has no ``display-seq`` (or a sequence of "0"),
    otherwise to ``AlternateSeries`` with the sequence as
    ``AlternateNumber``. Any other type is ignored.
    """
    refines = meta.get("refines") or ""
    kind = content.lower()

    def sibling_value(prop: str) -> str | None:
        # Cleaned text of the first meta with this property that refines the
        # same target, or None when there is no such meta.
        return next(
            (clean_text(m.text) for m in metas
             if (m.get("property") or "").lower() == prop
             and (m.get("refines") or "") == refines),
            None,
        )

    if kind == "main":
        # SeriesSort from the "file-as" refinement of the main title
        file_as = sibling_value("file-as")
        if file_as is not None:
            ci.set("SeriesSort", file_as)
        return

    if kind != "collection":
        return

    title = titles.get(refines.lstrip("#"), "")
    if not title:
        return
    seq = sibling_value("display-seq") or ""
    safe_title = title.replace(",", "_")
    if seq and seq != "0":
        ci.append("AlternateSeries", safe_title)
        ci.append("AlternateNumber", seq)
    else:
        ci.append("SeriesGroup", safe_title)
def ci_has(ci: ComicInfo, field_name: str) -> bool:
    """Report whether ``ci`` holds a non-empty value for ``field_name``."""
    current = ci._fields.get(field_name)
    return True if current else False
# --- CBZ writing -------------------------------------------------------------
def write_cbz(src: zipfile.ZipFile, out_path: Path, cover: str | None,
              pages: list[str], pkg: Package, ci: ComicInfo, pad: int) -> int:
    """Write ``ComicInfo.xml``, the cover and all pages into a CBZ.

    The archive is built under a temporary ``<name>.part`` file next to
    ``out_path`` and renamed into place only once complete. A failure
    mid-write therefore never leaves a truncated CBZ behind and never
    destroys a previously existing output.

    Args:
        src: Open source EPUB archive the image bytes are copied from.
        out_path: Destination ``.cbz`` path.
        cover: Zip member used as the cover, written first as
            ``0000_cover<ext>``; ``None`` for no cover.
        pages: Zip members to write, in order, numbered from 1.
        pkg: Parsed package, used to look up media types for extensions.
        ci: Metadata to serialise; its ``PageCount`` is set here.
        pad: Zero-padding width of the page numbers.

    Returns:
        The number of images written (cover included).

    Raises:
        KeyError: a listed member is missing from ``src``.
        OSError: the output could not be written or renamed.
    """
    # Pre-compute page count for ComicInfo
    total = len(pages) + (1 if cover else 0)
    ci.set("PageCount", str(total))

    def ext_for(member: str) -> str:
        mi = pkg.manifest_by_path.get(member)
        return out_ext(member, mi.media_type if mi else None)

    tmp_path = out_path.with_name(out_path.name + ".part")
    written = 0
    try:
        # Entries are stored uncompressed.
        with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_STORED) as out:
            out.writestr("ComicInfo.xml", ci.to_xml().encode("utf-8"))
            if cover:
                out.writestr(f"0000_cover{ext_for(cover)}", src.read(cover))
                written += 1
            for i, member in enumerate(pages, start=1):
                out.writestr(f"{i:0{pad}d}{ext_for(member)}", src.read(member))
                written += 1
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file is gone and this is a
        # no-op; after a failure it removes the partial archive.
        tmp_path.unlink(missing_ok=True)
    return written
# --- Orchestration -----------------------------------------------------------
def convert(epub_path: Path, out_path: Path) -> int:
    """Convert one EPUB into a CBZ and return the number of images written.

    Raises:
        ValueError: the EPUB yields neither a cover nor any page image.
    """
    with zipfile.ZipFile(epub_path) as zf:
        opf_path = locate_opf(zf)
        log.debug("OPF: %s", opf_path)
        pkg = parse_package(zf, opf_path)

        cover = resolve_cover(zf, pkg)
        log.debug("Cover: %s", cover)

        # Pages are already de-duplicated, so filtering the cover out drops
        # at most one entry and avoids writing the cover twice.
        pages = [page for page in sequence_images(zf, pkg) if page != cover]
        if not (cover or pages):
            raise ValueError("No images found in the EPUB")

        ci = build_comic_info(pkg)
        digits = len(str(len(pages)))
        pad = digits if digits > 4 else 4
        return write_cbz(zf, out_path, cover, pages, pkg, ci, pad)
def find_epubs(directory: Path, recursive: bool) -> list[Path]:
    """Return the sorted ``.epub`` files (any letter case) under ``directory``.

    Subdirectories are searched only when ``recursive`` is true.
    """
    walk = directory.rglob if recursive else directory.glob
    matches = [
        entry for entry in walk("*")
        if entry.is_file() and entry.suffix.lower() == ".epub"
    ]
    matches.sort()
    return matches
def run_one(epub_path: Path, out_path: Path, force: bool) -> str:
    """Convert a single file. Returns 'ok', 'skipped', or 'failed'.

    An existing output is skipped unless ``force`` is set. Conversion errors
    are logged and reported as 'failed' rather than raised.
    """
    if out_path.exists() and not force:
        log.warning("Skipping (output exists): %s", out_path)
        return "skipped"

    expected_errors = (zipfile.BadZipFile, ValueError, etree.XMLSyntaxError,
                       KeyError, OSError)
    try:
        count = convert(epub_path, out_path)
    except expected_errors as exc:
        log.error("Failed: %s (%s)", epub_path, exc)
        return "failed"
    else:
        log.info("Wrote %d images -> %s", count, out_path)
        return "ok"
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line interface used by ``main``."""
    parser = argparse.ArgumentParser(description="Convert EPUB(s) to CBZ.")
    parser.add_argument(
        "path", type=Path,
        help="Input .epub file, or a directory when --batch is set",
    )
    parser.add_argument(
        "-o", "--output", type=Path,
        help="Output .cbz path (single-file mode only)",
    )
    parser.add_argument(
        "-b", "--batch", action="store_true",
        help="Treat PATH as a directory and convert every .epub found",
    )
    parser.add_argument(
        "-r", "--recursive", action=argparse.BooleanOptionalAction,
        default=True,
        help="Recurse into subdirectories in batch mode (default: on)",
    )
    parser.add_argument(
        "-f", "--force", action="store_true",
        help="Overwrite existing .cbz output(s)",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Verbose logging",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Returns the process exit code: 0 on success, 1 when a conversion failed,
    2 for unusable input or options, 3 when the single-file output already
    exists and ``--force`` was not given.
    """
    args = _build_parser().parse_args(argv)
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
    )

    if not args.batch:
        # Single-file mode
        if not args.path.is_file():
            log.error("Input not found: %s", args.path)
            return 2
        out_path = args.output or args.path.with_suffix(".cbz")
        if out_path.exists() and not args.force:
            log.error("Output exists (use --force to overwrite): %s", out_path)
            return 3
        # The overwrite decision was made just above, so always force here.
        status = run_one(args.path, out_path, force=True)
        return 0 if status == "ok" else 1

    # Batch mode
    if args.output:
        log.error("--output cannot be used with --batch")
        return 2
    if not args.path.is_dir():
        log.error("Batch mode expects a directory: %s", args.path)
        return 2
    files = find_epubs(args.path, args.recursive)
    if not files:
        log.error("No .epub files found under %s", args.path)
        return 2

    tally = {"ok": 0, "skipped": 0, "failed": 0}
    for epub in files:
        outcome = run_one(epub, epub.with_suffix(".cbz"), args.force)
        tally[outcome] += 1
    log.info("Batch complete: %d converted, %d skipped, %d failed",
             tally["ok"], tally["skipped"], tally["failed"])
    return 1 if tally["failed"] else 0
if __name__ == "__main__":
    # Hand main()'s return code to the interpreter as the exit status.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment