Created
May 30, 2026 13:31
-
-
Save majora2007/3bf9bd9417b36a7e7cd4cd11cfb5906e to your computer and use it in GitHub Desktop.
Convert Image-based Epubs -> CBZs with Metadata
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| epub2cbz - Convert an EPUB (2 or 3) to a CBZ. | |
| Strategy: | |
| * Walk the EPUB spine in reading order. | |
| * Open each spine document and extract image references in DOM order. | |
| * Dedupe across the whole book (an image referenced twice is written once). | |
| * Copy image bytes verbatim (no decode / re-encode) into the CBZ. | |
| * Resolve the cover and write it first as 0000_cover.<ext>. | |
| * Map EPUB2 + EPUB3 metadata to a Kavita-flavored ComicInfo.xml. | |
| Dependency: lxml (pip install lxml) | |
| Usage: | |
| python epub2cbz.py book.epub | |
| python epub2cbz.py book.epub -o /out/book.cbz --force -v | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import posixpath | |
| import re | |
| import sys | |
| import zipfile | |
| from dataclasses import dataclass, field | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from urllib.parse import unquote | |
| from xml.sax.saxutils import escape | |
| from lxml import etree, html | |
# Logger shared by every function in this module.
log = logging.getLogger("epub2cbz")

# --- Namespaces --------------------------------------------------------------
# XML namespace URIs used when building Clark-notation ("{uri}name") lookups.
DC = "http://purl.org/dc/elements/1.1/"
OPF = "http://www.idpf.org/2007/opf"
CONTAINER = "urn:oasis:names:tc:opendocument:xmlns:container"
XLINK = "http://www.w3.org/1999/xlink"

# --- Lookups -----------------------------------------------------------------
# marc:relators role codes (and human-readable aliases) -> ComicInfo creator
# field.  Written grouped by target field; the resulting mapping is flat.
ROLE_TO_FIELD: dict[str, str] = {
    alias: target
    for target, aliases in (
        ("CoverArtist", ("art", "artist")),
        ("Writer", ("aut", "author", "creator", "cre")),
        ("Publisher", ("pbl", "publisher")),
        ("Translator", ("trl", "translator")),
        ("Editor", ("edt", "editor")),
        ("Inker", ("ill", "illustrator")),
        ("Colorist", ("clr", "colorist")),
    )
    for alias in aliases
}

# Declared media type -> file extension used for the page inside the CBZ.
MEDIA_EXT: dict[str, str] = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/avif": ".avif",
    "image/svg+xml": ".svg",
    "image/bmp": ".bmp",
    "image/tiff": ".tiff",
}

# File extensions treated as raster images when no manifest entry says so.
IMAGE_EXTS = {
    ".avif",
    ".bmp",
    ".gif",
    ".jpeg",
    ".jpg",
    ".png",
    ".tiff",
    ".webp",
}

# ComicInfo element emission order (schema sequence + Kavita extensions placed
# next to related standard fields). Only non-empty fields are written.
FIELD_ORDER: list[str] = [
    # Identity / series placement
    "Title", "TitleSort", "Series", "SeriesSort", "Number", "Count", "Volume",
    "AlternateSeries", "AlternateNumber", "AlternateCount",
    # Free text
    "Summary", "Notes",
    # Publication date
    "Year", "Month", "Day",
    # People
    "Writer", "Penciller", "Inker", "Colorist", "Letterer", "CoverArtist",
    "Editor", "Translator", "Publisher", "Imprint",
    # Classification
    "Genre", "Tags", "Web", "PageCount", "LanguageISO", "Format",
    "BlackAndWhite", "Manga", "Characters", "Teams", "Locations",
    "MainCharacterOrTeam", "ScanInformation", "StoryArc", "StoryArcNumber",
    "SeriesGroup", "AgeRating", "CommunityRating", "UserRating", "Review",
    # Identifiers
    "Isbn", "GTIN",
]
| # --- Model ------------------------------------------------------------------- | |
@dataclass
class ManifestItem:
    """A single ``<item>`` from the OPF manifest."""

    # Value of the item's ``id`` attribute (what spine itemrefs point at).
    id: str
    # ``href`` attribute, relative to the OPF directory.
    href: str
    # Resolved zip member name.
    full_path: str
    # ``media-type`` attribute of the item.
    media_type: str
    # Whitespace-separated tokens of the ``properties`` attribute.
    properties: set[str] = field(default_factory=set)


@dataclass
class Package:
    """Parsed view of an OPF package document."""

    # Directory part of the OPF path inside the archive.
    opf_dir: str
    # The OPF ``<metadata>`` element.
    metadata: etree._Element
    # Manifest entries indexed by item id and by resolved zip member name.
    manifest_by_id: dict[str, ManifestItem]
    manifest_by_path: dict[str, ManifestItem]
    # Ordered list of manifest item ids.
    spine: list[str]
    # Guide type -> manifest item id (best effort).
    guide: dict[str, str]
| # --- Small helpers ----------------------------------------------------------- | |
def localname(tag: object) -> str:
    """Return the local part of a Clark-notation tag.

    ``"{namespace}name"`` yields ``"name"``; a string without ``}`` is
    returned unchanged.  Any non-string ``tag`` yields ``""``.
    """
    if isinstance(tag, str):
        # rpartition keeps the whole string in the last slot when "}" is absent.
        return tag.rpartition("}")[2]
    return ""
def attr_ns(el: etree._Element, name: str) -> str | None:
    """Read attribute ``name`` from ``el``, with or without the OPF namespace.

    The OPF-namespaced form is tried first; when it is missing or empty the
    plain attribute is returned instead (which may itself be ``None``).
    """
    namespaced = el.get("{%s}%s" % (OPF, name))
    if namespaced:
        return namespaced
    return el.get(name)
def clean_text(value: str | None) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    ``None`` and the empty string both yield ``""``.
    """
    if not value:
        return ""
    words = value.split()
    return " ".join(words)
def out_ext(member: str, media_type: str | None) -> str:
    """Pick the file extension for an image written into the CBZ.

    The member's own (lower-cased) extension wins when it is a known image
    extension or ``.svg``, with ``.jpeg`` normalised to ``.jpg``.  Otherwise
    the declared media type is looked up in ``MEDIA_EXT``; ``.img`` is the
    last-resort fallback.
    """
    _, suffix = posixpath.splitext(member)
    suffix = suffix.lower()
    if suffix == ".jpeg":
        return ".jpg"
    if suffix == ".svg" or suffix in IMAGE_EXTS:
        return suffix
    declared = media_type.lower() if media_type else ""
    return MEDIA_EXT.get(declared, ".img")
def parse_date(value: str | None) -> tuple[int, int, int]:
    """Parse the leading ``YYYY[-MM[-DD]]`` part of a date string.

    Args:
        value: Raw date text, e.g. ``"2021"``, ``"2021-05"`` or
            ``"2021-05-06T12:00:00Z"``.  Anything after the date part is
            ignored.

    Returns:
        ``(year, month, day)``.  A component that is missing or unusable is
        reported as 0: no match at all gives ``(0, 0, 0)``; a month outside
        1-12 drops both month and day; a day outside 1-31 drops the day only.
    """
    if not value:
        return (0, 0, 0)
    # A single regex covers the bare-year case too.  ``\d`` only matches real
    # decimal digits, so ``int()`` below cannot raise on characters that
    # ``str.isdigit`` accepts but ``int`` rejects (e.g. superscript digits).
    match = re.match(r"(\d{4})(?:-(\d{1,2}))?(?:-(\d{1,2}))?", value.strip())
    if match is None:
        return (0, 0, 0)
    year = int(match.group(1))
    month = int(match.group(2) or 0)
    day = int(match.group(3) or 0)
    if not 1 <= month <= 12:
        # A day is meaningless without a valid month.
        return (year, 0, 0)
    if not 1 <= day <= 31:
        day = 0
    return (year, month, day)
| def _clean_isbn(raw: str) -> str: | |
| raw = raw.lower().replace("urn:isbn:", "").replace("isbn:", "") | |
| return re.sub(r"[^0-9xX]", "", raw) | |
def is_valid_isbn10(s: str) -> bool:
    """Check the ISBN-10 weighted checksum of ``s``.

    ``s`` must be exactly ten characters: digits, with ``x``/``X`` (value 10)
    allowed only in the last position.  Characters are weighted 10 down to 1
    and the weighted sum must be divisible by 11.
    """
    if len(s) != 10:
        return False
    checksum = 0
    for weight, ch in zip(range(10, 0, -1), s):
        if ch.isdigit():
            checksum += int(ch) * weight
        elif ch in "xX" and weight == 1:
            # Weight 1 is the final (check) position.
            checksum += 10
        else:
            return False
    return checksum % 11 == 0
def is_valid_isbn13(s: str) -> bool:
    """Check the ISBN-13 checksum of ``s``.

    ``s`` must be exactly thirteen digits.  Digits at even indexes weigh 1,
    digits at odd indexes weigh 3, and the weighted sum must be divisible
    by 10.
    """
    if len(s) != 13 or not s.isdigit():
        return False
    weight_one = sum(int(ch) for ch in s[0::2])
    weight_three = sum(int(ch) for ch in s[1::2])
    return (weight_one + 3 * weight_three) % 10 == 0
| # --- ComicInfo accumulator --------------------------------------------------- | |
class ComicInfo:
    """Accumulates ComicInfo.xml field values and serialises them.

    Values are stored as non-empty strings keyed by element name.  Only
    names listed in ``FIELD_ORDER`` are emitted by :meth:`to_xml`.
    """

    def __init__(self) -> None:
        self._fields: dict[str, str] = {}

    def set(self, field_name: str, value: str | None) -> None:
        """Store ``value`` (stripped) under ``field_name``.

        ``None`` and blank values are ignored, leaving any existing value
        untouched.
        """
        stripped = value.strip() if value else ""
        if not stripped:
            return
        self._fields[field_name] = stripped

    def set_if_empty(self, field_name: str, value: str | None) -> None:
        """Like :meth:`set`, but only when the field has no value yet."""
        if self._fields.get(field_name):
            return
        self.set(field_name, value)

    def append(self, field_name: str, value: str | None, sep: str = ",",
               escape: bool = False) -> None:
        """Add ``value`` to a ``sep``-joined multi-value field, skipping duplicates.

        The value is whitespace-normalised first; blank values are ignored.
        With ``escape=True`` (used for person names) backslashes and
        separators inside the value are backslash-escaped, so a name such as
        "Doe, John" stays one entry instead of being split by a consumer.
        """
        entry = clean_text(value)
        if not entry:
            return
        if escape:
            entry = entry.replace("\\", "\\\\").replace(sep, "\\" + sep)
        current = self._fields.get(field_name, "")
        if not current:
            self._fields[field_name] = entry
            return
        # Only separators NOT preceded by a backslash delimit entries.
        boundary = r"(?<!\\)" + re.escape(sep)
        known = {part.strip() for part in re.split(boundary, current)}
        if entry not in known:
            self._fields[field_name] = current + sep + entry

    def to_xml(self) -> str:
        """Render the non-empty fields as a ComicInfo XML document string.

        Elements appear in ``FIELD_ORDER`` order and their text is
        XML-escaped.
        """
        head = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<ComicInfo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xmlns:xsd="http://www.w3.org/2001/XMLSchema">',
        ]
        body = [
            f" <{name}>{escape(self._fields[name])}</{name}>"
            for name in FIELD_ORDER
            if self._fields.get(name)
        ]
        return "\n".join(head + body + ["</ComicInfo>"])
| # --- EPUB opening / OPF parsing ---------------------------------------------- | |
def locate_opf(zf: zipfile.ZipFile) -> str:
    """Return the OPF path named by ``META-INF/container.xml``.

    The first ``<rootfile>`` element in the container namespace is used.

    Raises:
        ValueError: if there is no such element or it has no ``full-path``.
    """
    container = etree.fromstring(zf.read("META-INF/container.xml"))
    rootfile = container.find(f".//{{{CONTAINER}}}rootfile")
    # Compare against None explicitly rather than relying on element truthiness.
    full_path = rootfile.get("full-path") if rootfile is not None else None
    if not full_path:
        raise ValueError("container.xml does not reference an OPF rootfile")
    return full_path
def resolve_member(candidate: str, names: set[str], names_list: list[str]) -> str | None:
    """Map a computed path onto an actual zip member name.

    An exact hit in ``names`` wins.  Otherwise the first entry of
    ``names_list`` (archive order) that equals the candidate or shares its
    basename is returned; ``None`` when nothing matches.
    """
    if candidate in names:
        return candidate
    base = posixpath.basename(candidate)
    nested = "/" + base
    return next(
        (
            name
            for name in names_list
            if name == candidate or name.endswith(nested) or name == base
        ),
        None,
    )
def parse_package(zf: zipfile.ZipFile, opf_path: str) -> Package:
    """Parse the OPF at ``opf_path`` into a ``Package``.

    Manifest hrefs are percent-decoded, joined to the OPF directory and
    matched against the archive's member names (falling back to the computed
    path when nothing matches).  Spine itemrefs pointing at unknown manifest
    ids are dropped.  Guide references are recorded only when they resolve to
    a manifest item.

    Raises:
        ValueError: if the OPF lacks a metadata, manifest or spine element.
    """
    member_list = zf.namelist()
    member_set = set(member_list)
    opf_dir = posixpath.dirname(opf_path)

    def to_member(href: str) -> str:
        """Candidate zip path for an OPF-relative, percent-encoded href."""
        return posixpath.normpath(posixpath.join(opf_dir, unquote(href)))

    package_el = etree.fromstring(zf.read(opf_path))
    metadata, manifest_el, spine_el, guide_el = (
        package_el.find(f"{{{OPF}}}{section}")
        for section in ("metadata", "manifest", "spine", "guide")
    )
    if any(el is None for el in (metadata, manifest_el, spine_el)):
        raise ValueError("OPF missing metadata/manifest/spine")

    by_id: dict[str, ManifestItem] = {}
    by_path: dict[str, ManifestItem] = {}
    for item in manifest_el:
        if localname(item.tag) != "item":
            continue
        item_id, href = item.get("id"), item.get("href")
        if not (item_id and href):
            continue
        candidate = to_member(href)
        entry = ManifestItem(
            item_id,
            href,
            resolve_member(candidate, member_set, member_list) or candidate,
            item.get("media-type", ""),
            set((item.get("properties") or "").split()),
        )
        by_id[item_id] = entry
        by_path[entry.full_path] = entry

    # linear="no" items are supplementary; they are still included so no
    # referenced image is silently dropped.
    spine = [
        ref.get("idref")
        for ref in spine_el
        if localname(ref.tag) == "itemref" and ref.get("idref") in by_id
    ]

    guide: dict[str, str] = {}
    for ref in (guide_el if guide_el is not None else ()):
        if localname(ref.tag) != "reference":
            continue
        kind = (ref.get("type") or "").lower()
        target = ref.get("href")
        if not (kind and target):
            continue
        # Fragment identifiers are not part of the member name.
        member = resolve_member(to_member(target.split("#")[0]), member_set, member_list)
        if member and member in by_path:
            guide[kind] = by_path[member].id

    return Package(opf_dir, metadata, by_id, by_path, spine, guide)
| # --- Image reference extraction ---------------------------------------------- | |
def iter_image_refs(doc_bytes: bytes) -> list[str]:
    """Collect image references from a spine document, in DOM order.

    ``<img>`` contributes its ``src``; ``<image>`` (SVG) contributes the
    first non-empty of ``href``, ``xlink:href`` and the XLink-namespaced
    ``href``.  A document that cannot be parsed is logged and yields ``[]``
    so that one bad file does not abort the whole book.
    """
    try:
        root = html.fromstring(doc_bytes)
    except Exception as exc:  # malformed doc - skip it, don't abort the book
        log.warning("Failed to parse a spine document: %s", exc)
        return []

    svg_href_attrs = ("href", "xlink:href", f"{{{XLINK}}}href")
    found: list[str] = []
    for el in root.iter():
        # localname() already maps non-string tags to "".
        kind = localname(el.tag).lower()
        if kind == "img":
            target = el.get("src")
        elif kind == "image":  # SVG <image>
            target = next(filter(None, map(el.get, svg_href_attrs)), None)
        else:
            target = None
        if target:
            found.append(target)
    return found
def resolve_ref(ref: str, base_dir: str, names: set[str], names_list: list[str]) -> str | None:
    """Resolve an image reference found in a document to a zip member.

    The fragment is dropped and the remainder percent-decoded and stripped.
    Empty references and ``http://``, ``https://``, ``data:`` and ``mailto:``
    targets yield ``None``.  Everything else is joined to ``base_dir``,
    normalised and handed to ``resolve_member``.
    """
    without_fragment = ref.split("#")[0]
    target = unquote(without_fragment).strip()
    if not target:
        return None
    if target.startswith(("http://", "https://", "data:", "mailto:")):
        return None
    joined = posixpath.join(base_dir, target)
    return resolve_member(posixpath.normpath(joined), names, names_list)
def is_image_member(member: str, pkg: Package) -> bool:
    """Tell whether zip member ``member`` should be treated as an image.

    True when its manifest entry declares an ``image/*`` media type, or
    failing that when its file extension is in ``IMAGE_EXTS``.
    """
    entry = pkg.manifest_by_path.get(member)
    declared = entry.media_type.lower() if entry else ""
    if declared.startswith("image/"):
        return True
    _, suffix = posixpath.splitext(member)
    return suffix.lower() in IMAGE_EXTS
def sequence_images(zf: zipfile.ZipFile, pkg: Package) -> list[str]:
    """Ordered, de-duplicated list of image zip-members in reading order.

    Spine items are visited in order.  An item that is itself an image is
    taken directly (when present in the archive); any other item is read as
    a document and its image references are taken in DOM order.  A spine
    document missing from the archive is logged and skipped.
    """
    member_list = zf.namelist()
    member_set = set(member_list)
    # dict keys keep insertion order and give O(1) "already emitted" checks.
    pages: dict[str, None] = {}

    for item_id in pkg.spine:
        item = pkg.manifest_by_id[item_id]

        # Image directly in the spine (rare, but valid for fixed-layout).
        if item.media_type.lower().startswith("image/"):
            if item.full_path in member_set:
                pages.setdefault(item.full_path, None)
            continue

        # Otherwise it's a (X)HTML document - extract its images in DOM order.
        try:
            markup = zf.read(item.full_path)
        except KeyError:
            log.warning("Spine document not found in archive: %s", item.full_path)
            continue

        doc_dir = posixpath.dirname(item.full_path)
        for ref in iter_image_refs(markup):
            member = resolve_ref(ref, doc_dir, member_set, member_list)
            if not member or member in pages:
                continue
            if is_image_member(member, pkg):
                pages[member] = None

    return list(pages)
| # --- Cover resolution -------------------------------------------------------- | |
def first_image_in_doc(zf: zipfile.ZipFile, member: str, pkg: Package) -> str | None:
    """Return the first image member referenced by document ``member``.

    References are examined in DOM order; the first one that resolves to a
    zip member recognised as an image wins.  ``None`` is returned when the
    document is missing from the archive or references no usable image.
    """
    try:
        markup = zf.read(member)
    except KeyError:
        return None

    member_list = zf.namelist()
    member_set = set(member_list)
    doc_dir = posixpath.dirname(member)
    resolved = (
        resolve_ref(ref, doc_dir, member_set, member_list)
        for ref in iter_image_refs(markup)
    )
    return next((m for m in resolved if m and is_image_member(m, pkg)), None)
def _cover_from_item(zf: zipfile.ZipFile, pkg: Package, mi: ManifestItem,
                     names: set[str]) -> str | None:
    """Image member designated by a manifest item that is declared as cover.

    An image item is used directly, but only when it actually exists in the
    archive; any other item is treated as a wrapper document and its first
    referenced image is used.
    """
    if mi.media_type.lower().startswith("image/"):
        return mi.full_path if mi.full_path in names else None
    return first_image_in_doc(zf, mi.full_path, pkg)


def resolve_cover(zf: zipfile.ZipFile, pkg: Package) -> str | None:
    """Find the zip member holding the book's cover image.

    Strategies, in order:
      1. EPUB3 manifest item carrying ``properties="cover-image"``.
      2. EPUB2 ``<meta name="cover" content="<manifest-id>">``.
      3. Guide reference of type ``cover`` (usually an XHTML wrapper).
      4. Any image item whose file name contains "cover".

    A candidate is only accepted when the member is really present in the
    archive.  Manifest paths fall back to the computed (unverified) path when
    nothing matches, so a manifest may name a cover that was never packaged;
    returning it would make the later ``ZipFile.read`` raise ``KeyError`` and
    fail the whole conversion instead of trying the next strategy.

    Returns:
        The member name, or ``None`` when no strategy yields an existing image.
    """
    names = set(zf.namelist())

    # 1. EPUB3: manifest item with properties="cover-image"
    for mi in pkg.manifest_by_id.values():
        if "cover-image" in mi.properties and mi.full_path in names:
            return mi.full_path

    # 2. EPUB2: <meta name="cover" content="<manifest-id>">
    for child in pkg.metadata:
        if localname(child.tag) != "meta":
            continue
        if (child.get("name") or "").lower() != "cover":
            continue
        mi = pkg.manifest_by_id.get(child.get("content", ""))
        if mi:
            img = _cover_from_item(zf, pkg, mi, names)
            if img:
                return img

    # 3. Guide reference type="cover" (usually an XHTML wrapper)
    cover_id = pkg.guide.get("cover")
    mi = pkg.manifest_by_id.get(cover_id) if cover_id else None
    if mi:
        img = _cover_from_item(zf, pkg, mi, names)
        if img:
            return img

    # 4. Filename fallback: an image whose name contains "cover"
    for mi in pkg.manifest_by_id.values():
        if (mi.media_type.lower().startswith("image/")
                and "cover" in posixpath.basename(mi.full_path).lower()
                and mi.full_path in names):
            return mi.full_path

    return None
| # --- Metadata mapping -------------------------------------------------------- | |
def build_comic_info(pkg: Package) -> ComicInfo:
    """Map the package's OPF metadata onto a ``ComicInfo`` accumulator.

    The metadata children are scanned once to collect titles, publishers,
    subjects, languages, dates, identifiers, people and ``<meta>`` elements;
    the collected values are then assigned to ComicInfo fields:

    * first ``title`` -> Title; first ``description`` -> Summary
    * creators/contributors -> creator fields via ``ROLE_TO_FIELD`` (EPUB3
      ``role`` refinements take precedence over the ``role`` attribute;
      creators without a recognised role become Writer)
    * calibre metas and EPUB3 collection/title metas -> Series, SeriesSort,
      Volume, TitleSort, UserRating, SeriesGroup, AlternateSeries
    * publishers, subjects, first language, URL identifiers and the
      publication date -> Publisher, Genre, LanguageISO, Web, Year/Month/Day
    * a valid ISBN-10/13 identifier -> Isbn

    Returns:
        The populated ``ComicInfo`` (PageCount is not set here).
    """
    md = pkg.metadata
    ci = ComicInfo()
    titles: dict[str, str] = {}  # id -> title text
    first_title: str | None = None
    publishers: list[str] = []
    subjects: list[str] = []
    languages: list[str] = []
    pub_dates: list[tuple[str | None, str]] = []  # (event, date)
    weblinks: list[str] = []
    people: list[dict] = []  # {id, name, role, source}
    metas: list[etree._Element] = []

    for child in md:
        name = localname(child.tag)
        text = clean_text(child.text)
        if name == "title":
            tid = child.get("id")
            if tid:
                titles[tid] = text
            if first_title is None:
                first_title = text
        elif name == "description" and not ci_has(ci, "Summary"):
            ci.set("Summary", text)
        elif name == "publisher":
            publishers.append(text)
        elif name == "subject":
            subjects.append(text.lower())
        elif name == "language":
            languages.append(text)
        elif name == "date":
            pub_dates.append((attr_ns(child, "event"), text))
        elif name == "identifier":
            scheme = (attr_ns(child, "scheme") or "").lower()
            ident = (child.text or "").strip()
            low = ident.lower()
            if scheme == "isbn" or low.startswith(("urn:isbn:", "isbn:")):
                isbn = _clean_isbn(ident)
                if is_valid_isbn10(isbn) or is_valid_isbn13(isbn):
                    ci.set("Isbn", isbn)
            if scheme == "url" or low.startswith("url:"):
                # Remove only a *leading* "url:" marker, whatever its letter
                # case.  The prefix test above is case-insensitive, so the
                # removal must be too ("URL:http://..."), and a "url:" that
                # occurs inside the link itself must be left alone.
                link = ident[len("url:"):] if low.startswith("url:") else ident
                weblinks.append(link.strip())
        elif name in ("creator", "contributor"):
            people.append({
                "id": child.get("id"),
                "name": text,
                "role": attr_ns(child, "role"),
                "source": name,
            })
        elif name == "meta":
            metas.append(child)

    ci.set("Title", first_title)

    # Pass 1: EPUB3 role refinements (meta property="role" refines="#id")
    refined_roles: dict[str, str] = {}
    for m in metas:
        if (m.get("property") or "").lower() == "role":
            scheme = (m.get("scheme") or "").lower()
            # Only marc:relators codes (or unspecified schemes) are understood.
            if scheme and scheme != "marc:relators":
                continue
            cid = (m.get("refines") or "").lstrip("#")
            if cid:
                refined_roles[cid] = clean_text(m.text)

    # Assign people to creator fields
    for p in people:
        nm = clean_text(p["name"])
        if not nm:
            continue
        role = refined_roles.get(p["id"] or "") or p["role"] or ""
        f = ROLE_TO_FIELD.get(role.lower())
        if f:
            ci.append(f, nm, escape=True)
        elif p["source"] == "creator":
            ci.append("Writer", nm, escape=True)  # unrefined creators -> Writer

    # Pass 2: calibre (EPUB2) + EPUB3 collection/title metas
    for m in metas:
        nm = (m.get("name") or "").lower()
        content_attr = m.get("content")
        if nm:
            if nm == "calibre:series":
                ci.set("Series", content_attr)
                ci.set_if_empty("SeriesSort", content_attr)
            elif nm == "calibre:series_index":
                ci.set("Volume", content_attr)
            elif nm == "calibre:title_sort":
                ci.set("TitleSort", content_attr)
            elif nm == "calibre:rating":
                ci.set("UserRating", (content_attr or "").strip())
        prop = (m.get("property") or "").lower()
        val = clean_text(m.text)
        if prop == "group-position":
            ci.set("Volume", val)
        elif prop == "belongs-to-collection":
            ci.set("Series", val)
            ci.set_if_empty("SeriesSort", val)
        elif prop == "title-type":
            _handle_title_type(m, metas, titles, ci, val)

    # Publisher / Genre / Language / Date / Web
    if publishers:
        ci.set("Publisher", ",".join(p for p in publishers if p))
    for s in subjects:
        ci.append("Genre", s)
    if languages and languages[0]:
        ci.set("LanguageISO", languages[0])
    if weblinks:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        ci.set("Web", ",".join(dict.fromkeys(w for w in weblinks if w)))

    # Prefer the date explicitly marked as the publication event.
    date_str = next((d for ev, d in pub_dates if ev == "publication"), None)
    if not date_str:
        date_str = next((d for _, d in pub_dates if d), None)
    year, month, day = parse_date(date_str)
    if year:
        ci.set("Year", str(year))
    if month:
        ci.set("Month", str(month))
    if day:
        ci.set("Day", str(day))

    # Standalone heuristic: no series and no volume -> single book
    if not ci_has(ci, "Series") and not ci_has(ci, "Volume"):
        ci.set("Count", "1")

    ci.set("Notes",
           f"Converted from EPUB by epub2cbz on "
           f"{datetime.now(timezone.utc).strftime('%Y-%m-%d')}")
    return ci
| def _handle_title_type(meta: etree._Element, metas: list[etree._Element], | |
| titles: dict[str, str], ci: ComicInfo, content: str) -> None: | |
| refines = meta.get("refines") or "" | |
| tid = refines.lstrip("#") | |
| content = content.lower() | |
| if content == "main": | |
| # SeriesSort from the "file-as" refinement of the main title | |
| for m in metas: | |
| if (m.get("property") or "").lower() == "file-as" and (m.get("refines") or "") == refines: | |
| ci.set("SeriesSort", clean_text(m.text)) | |
| return | |
| elif content == "collection": | |
| title = titles.get(tid, "") | |
| if not title: | |
| return | |
| seq = next((clean_text(m.text) for m in metas | |
| if (m.get("property") or "").lower() == "display-seq" | |
| and (m.get("refines") or "") == refines), "") | |
| if not seq or seq == "0": | |
| ci.append("SeriesGroup", title.replace(",", "_")) | |
| else: | |
| ci.append("AlternateSeries", title.replace(",", "_")) | |
| ci.append("AlternateNumber", seq) | |
def ci_has(ci: ComicInfo, field_name: str) -> bool:
    """Return True when ``ci`` holds a non-empty value for ``field_name``."""
    stored = ci._fields.get(field_name)
    return True if stored else False
| # --- CBZ writing ------------------------------------------------------------- | |
def write_cbz(src: zipfile.ZipFile, out_path: Path, cover: str | None,
              pages: list[str], pkg: Package, ci: ComicInfo, pad: int) -> int:
    """Write ComicInfo.xml, the cover and all pages into a CBZ at ``out_path``.

    Image bytes are copied verbatim from ``src`` and stored uncompressed.
    The cover (if any) is written as ``0000_cover<ext>``; pages are numbered
    from 1, zero-padded to ``pad`` digits.  ``PageCount`` is set on ``ci``
    before it is serialised.

    The archive is first written to a ``.part`` file next to ``out_path`` and
    only moved into place once complete.  A failure part-way through (for
    example a member missing from ``src``) therefore neither leaves a
    truncated .cbz behind - which a later run without --force would skip as
    "already converted" - nor destroys an existing good output.

    Returns:
        The number of images written (cover included).

    Raises:
        Whatever the underlying read/write raises; the temporary file is
        removed before the exception propagates.
    """
    # The page count is known up front, so ComicInfo.xml can be written first.
    ci.set("PageCount", str(len(pages) + (1 if cover else 0)))

    def ext_for(member: str) -> str:
        """Output extension for a source member, using its manifest media type."""
        mi = pkg.manifest_by_path.get(member)
        return out_ext(member, mi.media_type if mi else None)

    tmp_path = out_path.with_name(out_path.name + ".part")
    written = 0
    try:
        with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_STORED) as out:
            out.writestr("ComicInfo.xml", ci.to_xml().encode("utf-8"))
            if cover:
                out.writestr(f"0000_cover{ext_for(cover)}", src.read(cover))
                written += 1
            for i, member in enumerate(pages, start=1):
                out.writestr(f"{i:0{pad}d}{ext_for(member)}", src.read(member))
                written += 1
        # Same directory, so this is a rename that also replaces an old file.
        tmp_path.replace(out_path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise
    return written
| # --- Orchestration ----------------------------------------------------------- | |
def convert(epub_path: Path, out_path: Path) -> int:
    """Convert one EPUB file into a CBZ and return the number of images written.

    Pipeline: locate and parse the OPF, resolve the cover, sequence the page
    images (the cover is taken out of the page list so it is not written
    twice), build the ComicInfo metadata and write the archive.

    Raises:
        ValueError: if the book yields neither a cover nor any page image.
    """
    with zipfile.ZipFile(epub_path) as zf:
        opf_path = locate_opf(zf)
        log.debug("OPF: %s", opf_path)
        pkg = parse_package(zf, opf_path)

        cover = resolve_cover(zf, pkg)
        log.debug("Cover: %s", cover)

        # sequence_images() returns each member at most once, so filtering
        # is the same as removing the single cover entry.
        pages = [member for member in sequence_images(zf, pkg) if member != cover]
        if not (cover or pages):
            raise ValueError("No images found in the EPUB")

        ci = build_comic_info(pkg)
        digits_needed = len(str(len(pages)))
        return write_cbz(zf, out_path, cover, pages, pkg, ci, max(4, digits_needed))
def find_epubs(directory: Path, recursive: bool) -> list[Path]:
    """Return the sorted ``.epub`` files under ``directory``.

    The extension is matched case-insensitively and only regular files
    count.  Sub-directories are searched only when ``recursive`` is true.
    """
    walk = directory.rglob if recursive else directory.glob
    found = [
        path
        for path in walk("*")
        if path.is_file() and path.suffix.lower() == ".epub"
    ]
    found.sort()
    return found
def run_one(epub_path: Path, out_path: Path, force: bool) -> str:
    """Convert a single file. Returns 'ok', 'skipped', or 'failed'.

    An existing ``out_path`` is skipped unless ``force`` is set.  The
    expected conversion errors (bad zip, malformed XML, missing members,
    I/O errors, ...) are logged and reported as 'failed' rather than raised.
    """
    already_there = out_path.exists()
    if already_there and not force:
        log.warning("Skipping (output exists): %s", out_path)
        return "skipped"

    try:
        image_count = convert(epub_path, out_path)
    except (zipfile.BadZipFile, ValueError, etree.XMLSyntaxError, KeyError, OSError) as exc:
        log.error("Failed: %s (%s)", epub_path, exc)
        return "failed"
    else:
        log.info("Wrote %d images -> %s", image_count, out_path)
        return "ok"
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the converter."""
    parser = argparse.ArgumentParser(description="Convert EPUB(s) to CBZ.")
    parser.add_argument("path", type=Path,
                        help="Input .epub file, or a directory when --batch is set")
    parser.add_argument("-o", "--output", type=Path,
                        help="Output .cbz path (single-file mode only)")
    parser.add_argument("-b", "--batch", action="store_true",
                        help="Treat PATH as a directory and convert every .epub found")
    parser.add_argument("-r", "--recursive", action=argparse.BooleanOptionalAction,
                        default=True,
                        help="Recurse into subdirectories in batch mode (default: on)")
    parser.add_argument("-f", "--force", action="store_true",
                        help="Overwrite existing .cbz output(s)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose logging")
    return parser


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; returns the process exit status.

    Exit codes: 0 success, 1 at least one conversion failed, 2 usage or
    input error, 3 output already exists in single-file mode without
    ``--force``.
    """
    args = _build_parser().parse_args(argv)
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")

    if not args.batch:
        # Single-file mode
        if not args.path.is_file():
            log.error("Input not found: %s", args.path)
            return 2
        target = args.output or args.path.with_suffix(".cbz")
        if target.exists() and not args.force:
            log.error("Output exists (use --force to overwrite): %s", target)
            return 3
        # The existence check was done above, so run_one may overwrite.
        outcome = run_one(args.path, target, force=True)
        return 0 if outcome == "ok" else 1

    if args.output:
        log.error("--output cannot be used with --batch")
        return 2
    if not args.path.is_dir():
        log.error("Batch mode expects a directory: %s", args.path)
        return 2
    epubs = find_epubs(args.path, args.recursive)
    if not epubs:
        log.error("No .epub files found under %s", args.path)
        return 2

    tally = dict.fromkeys(("ok", "skipped", "failed"), 0)
    for epub in epubs:
        outcome = run_one(epub, epub.with_suffix(".cbz"), args.force)
        tally[outcome] += 1
    log.info("Batch complete: %d converted, %d skipped, %d failed",
             tally["ok"], tally["skipped"], tally["failed"])
    return 1 if tally["failed"] else 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment