Created
May 27, 2024 13:24
-
-
Save rgaudin/30d6461947b597357e0a7a7e6131dd7c to your computer and use it in GitHub Desktop.
zimwriterfs python implementation (based on zimscraperlib)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" zimwriterfs alternative implementation | |

Requires libmagic1 and zimscraperlib (python).

WARN: the following zimwriterfs features are not implemented:
- `--inflateHtml`
- `--skip-libmagic-check`
""" | |
import argparse | |
import datetime | |
import logging | |
from pathlib import Path | |
from zimscraperlib.__about__ import __version__ | |
from zimscraperlib.constants import ( | |
MAXIMUM_DESCRIPTION_METADATA_LENGTH, | |
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, | |
RECOMMENDED_MAX_TITLE_LENGTH, | |
) | |
from zimscraperlib.logging import getLogger | |
from zimscraperlib.zim.creator import Creator | |
from zimscraperlib.zim.filesystem import add_redirects_to_zim, add_to_zim | |
# Default value for the ZIM `Scraper` metadata; also printed by `--version`.
SCRAPER = f"zimwriterfs(py) {__version__}"

# Module-wide logger, created at INFO level. `main()` replaces it with a
# DEBUG-level logger when verbose output is requested.
logger = getLogger("zimwriterfs", level=logging.INFO)
def main(
    *,
    src_dir: str,
    dst_path: str,
    name: str,
    language: str,
    title: str,
    description: str,
    creator: str,
    publisher: str,
    illus_fname: str,
    tags: str,
    welcome: str,
    verbose: bool,
    threads: int,
    without_ft_index: bool,
    redirects_path: str | None = None,
    long_description: str | None = None,
    flavour: str | None = None,
    source: str | None = None,
    scraper: str | None = None,
    license_m: str | None = None,
    relation: str | None = None,
    cluster_size: int | None = None,
) -> None:
    """Create a ZIM file at `dst_path` from the content of `src_dir`.

    All parameters are keyword-only and mirror the command-line options
    declared in `entrypoint()`.

    Args:
        src_dir: directory whose files are recursively added to the ZIM.
        dst_path: path of the ZIM file to create.
        name: value of the `Name` metadata.
        language: value of the `Language` metadata; also passed to the
            indexing configuration.
        title: value of the `Title` metadata.
        description: value of the `Description` metadata.
        creator: value of the `Creator` metadata.
        publisher: value of the `Publisher` metadata.
        illus_fname: path of the illustration file, relative to `src_dir`.
        tags: semicolon-separated tags; an empty string means no `Tags`.
        welcome: path of the main page inside the ZIM.
        verbose: switch the logger to DEBUG and make the creator verbose.
        threads: number of workers handed to the creator.
        without_ft_index: when true, indexing is disabled.
        redirects_path: optional path to a redirects file.
        long_description: optional `LongDescription` metadata.
        flavour: optional `Flavour` metadata.
        source: optional `Source` metadata.
        scraper: optional `Scraper` metadata.
        license_m: optional `License` metadata.
        relation: optional `Relation` metadata.
        cluster_size: optional cluster size; ignored when falsy.

    Raises:
        OSError: if `src_dir` is not a directory, or if the illustration or
            the redirects file is not an existing file.
    """
    # `logger` is rebound below. Without this declaration the assignment would
    # make the name local to the whole function and every non-verbose run
    # would crash with UnboundLocalError on the first logging call.
    global logger
    if verbose:
        logger = getLogger("zimwriterfs", level=logging.DEBUG)

    logger.info(f"Dumping {src_dir} into {dst_path}")

    build_dir = Path(src_dir).expanduser().resolve()
    # is_dir() is already False for a missing path
    if not build_dir.is_dir():
        raise OSError(f"Incorrect build_dir: {build_dir}")

    illustration_fpath = build_dir / illus_fname
    if not illustration_fpath.is_file():
        raise OSError(f"Incorrect illustration: {illus_fname} ({illustration_fpath})")
    illustration_data = illustration_fpath.read_bytes()

    # Validate the redirects file up-front so that a wrong path fails before
    # the creator is started instead of half-way through the build.
    redirects_fpath = None
    if redirects_path:
        redirects_fpath = Path(redirects_path).expanduser().resolve()
        if not redirects_fpath.is_file():
            raise OSError(f"Incorrect redirects file: {redirects_fpath}")

    zim_file = (
        Creator(
            filename=Path(dst_path).expanduser().resolve(),
            main_path=welcome,
            ignore_duplicates=False,
            disable_metadata_checks=False,
        )
        .config_indexing(not without_ft_index, language)
        .config_verbose(verbose)
        .config_nbworkers(threads)
    )
    if cluster_size:
        zim_file.config_clustersize(cluster_size)

    zim_file.config_metadata(
        Name=name,
        Language=language,
        Title=title,
        Description=description,
        LongDescription=long_description,
        Creator=creator,
        Publisher=publisher,
        Date=datetime.datetime.now(tz=datetime.UTC).date(),
        Illustration_48x48_at_1=illustration_data,
        # an empty tags string is reported as "no Tags metadata"
        Tags=tags or None,
        Scraper=scraper,
        Flavour=flavour,
        Source=source,
        License=license_m,
        Relation=relation,
    )

    zim_file.start()
    try:
        logger.debug(f"Preparing zimfile at {zim_file.filename}")

        # recursively add content from build_dir
        logger.debug(f"Recursively adding files from {build_dir}")
        add_to_zim(build_dir, zim_file, build_dir)

        if redirects_fpath:
            logger.debug("Creating redirects")
            add_redirects_to_zim(zim_file, redirects_file=redirects_fpath)
    except Exception:
        # prevents .finish() which would create an incomplete .zim file
        # this would leave a .zim.tmp folder behind.
        # UPSTREAM: wait until a proper cancel() is provided
        zim_file.can_finish = False  # pragma: no cover
        raise
    finally:
        zim_file.finish()
def entrypoint():
    """Parse the command line and run `main()` with the parsed options.

    Every option's destination name matches a keyword parameter of `main()`,
    so the parsed namespace is forwarded as keyword arguments.

    Raises:
        SystemExit: with status 1 when `main()` raises any exception (the
            error is logged first). argparse itself also exits on invalid
            arguments, `--help` and `--version`.
    """
    parser = argparse.ArgumentParser(
        prog="zimwriterfs",
        description="Create a ZIM file off a directory containing a static website",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""Notes:
- openZIM Metadata: https://wiki.openzim.org/wiki/Metadata
- Set `ZIMSCRAPERLIB_DEBUG=1` environ to see all files included in ZIM""",
    )

    parser.add_argument("src_dir")
    parser.add_argument("dst_path")

    parser.add_argument(
        "-V",
        "--version",
        help="Display version and exit",
        action="version",
        version=SCRAPER,
    )

    parser.add_argument(
        "-n",
        "--name",
        help="Human identifier for the content (see spec)",
        required=True,
    )

    parser.add_argument(
        "-t",
        "--title",
        help=f"Title of the content ({RECOMMENDED_MAX_TITLE_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-l",
        "--language",
        help="ISO-639-3 Language code of the content",
        required=True,
    )

    parser.add_argument(
        "-d",
        "--description",
        help="Description of the content "
        + f"({MAXIMUM_DESCRIPTION_METADATA_LENGTH} chars max)",
        required=True,
    )

    parser.add_argument(
        "-w",
        "--welcome",
        help="Relative path to home/main page (ex: `home.html`)",
        required=True,
    )

    parser.add_argument(
        "-I",
        "--illustration",
        dest="illus_fname",
        help="Relative path to the ZIM's illustration. Must be a 48x48px PNG",
        required=True,
    )

    parser.add_argument(
        "-c",
        "--creator",
        help="Creator of the Content",
        required=True,
    )

    parser.add_argument(
        "-p",
        "--publisher",
        help="Publisher of the Content",
        required=True,
    )

    parser.add_argument(
        "-r",
        "--redirects",
        dest="redirects_path",
        help="Path to a TSV file containing a list of redirects (url title target_url)",
        required=False,
    )

    parser.add_argument(
        "-a",
        "--tags",
        help="ZIM Tags, semicolon separated",
        default="",
        required=False,
    )

    parser.add_argument(
        "-e",
        "--source",
        help="Content source URL",
        required=False,
    )

    parser.add_argument(
        "-o",
        "--flavour",
        help="custom (version independent) content flavour",
        required=False,
    )

    parser.add_argument(
        "-s",
        "--scraper",
        # the original help text was a copy of the --flavour one
        help="Name and version of the tool used to build the ZIM",
        default=SCRAPER,
        required=False,
    )

    parser.add_argument(
        "--license",
        help="License of the content",
        dest="license_m",
        required=False,
    )

    parser.add_argument(
        "--relation",
        help="URI of external related ressources ",
        required=False,
    )

    parser.add_argument(
        "-L",
        "--longDescription",
        dest="long_description",
        help="Longer description of the content "
        + f"({MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH} chars max)",
        required=False,
    )

    parser.add_argument(
        "-v",
        "--verbose",
        help="Whether to display processing details",
        action="store_true",
        default=False,
        required=False,
    )

    parser.add_argument(
        "-m",
        "--clusterSize",
        dest="cluster_size",
        type=int,
        help="Number of bytes per ZIM cluster (libzim defaults to 2MiB)",
        default=None,
        required=False,
    )

    parser.add_argument(
        "-J",
        "--threads",
        type=int,
        default=4,
        help="Number of threads/workers for libzim to use",
        required=False,
    )

    parser.add_argument(
        "-j",
        "--withoutFTIndex",
        dest="without_ft_index",
        action="store_true",
        default=False,
        help="Don't create and add a fulltext index of the content to the ZIM",
        required=False,
    )

    # zimwriterfs options that are not implemented here (see module docstring)
    # parser.add_argument(
    #     "-x",
    #     "--inflateHtml",
    #     dest="inflate_html",
    #     action="store_true",
    #     default=False,
    #     help="[NOT IMPLEMENTED] try to inflate HTML files before packing",
    #     required=False,
    # )

    # parser.add_argument(
    #     "--skip-libmagic-check",
    #     dest="skip_libmagic_check",
    #     action="store_true",
    #     default=False,
    #     help="Accept to run even if magic file cannot be loaded "
    #     + "(mimetypes in the zim file may be wrong)",
    #     required=False,
    # )

    args = parser.parse_args()

    try:
        # vars() is the documented way to view a Namespace as a dict
        main(**vars(args))
    except Exception as exc:
        logger.exception(exc)
        logger.error(f"FAILED. An error occured: {exc}")
        raise SystemExit(1) from exc
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    entrypoint()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment