Last active
October 4, 2024 05:06
-
-
Save mara004/87276da4f8be31c80c38036c6ab667d7 to your computer and use it in GitHub Desktop.
PDF rendering with pdf.js, from Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0
# This is an experimental pdf.js interface using shared memory.
# Unfortunately, shm-typed-array does not support Windows, so this is not exactly portable.
# For an older version by the same author that uses pipe-based data transfer via JSPyBridge's .blobValueOf(), see the link below:
# https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript (JSPyBridge), posix_ipc
# Js-Depends: pdfjs-dist, canvas, shm-typed-array
# You can install these with `python -m pip install` and `python -m javascript --install`, respectively.
# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file,
# because require("pdfjs-dist") appears to be broken on the author's nodejs 20.
# See the upstream build instructions. Commit 8b50836d is confirmed to work.
# Patch the require() calls if you want a different setup.
import time

# Start the clock before the remaining imports so their cost can be reported below.
starttm = time.time()

import mmap
import argparse
from pathlib import Path

# third-party
import PIL.Image
import javascript
import posix_ipc

# Directory containing this script, kept as a plain string.
THIS_DIR = str(Path(__file__).resolve().parent)

# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error
pdfjs = javascript.require(
    str(Path(THIS_DIR, "pdf.js/build/generic/build/pdf.js"))
)
libcanvas = javascript.require(
    str(Path(THIS_DIR, "pdf.js/node_modules/canvas"))
)
libshm = javascript.require("shm-typed-array")

# Report how long the imports / JS module loading took, then drop the timer.
print(f"Imports took {time.time() - starttm}s")
del starttm
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to a JPEG file via pdf.js and shared memory.

    Args:
        input: Document source handed to ``pdfjs.getDocument()`` unchanged
            (the CLI passes a file path string or a URL).
        outdir: Output directory; must support the ``/`` operator
            (e.g. ``pathlib.Path``). Files are written as ``out_<i>.jpg``,
            where ``<i>`` is the zero-based page index, zero-padded to the
            number of digits of the page count.
        scale: Scale factor passed to ``page.getViewport()``.

    Raises:
        AssertionError: If the JS side could not create the shared memory
            segment, or if the segment still exists after cleanup.
    """
    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    # First pass: collect the pixel size of each page, so a single shared
    # memory segment large enough for the biggest page can be allocated.
    size_start = time.time()
    sizes = []
    for i in range(n_pages):
        page = pdf.getPage(i+1)
        viewport = page.getViewport({"scale": scale})
        sizes.append( (int(viewport.width), int(viewport.height)) )

    # 4 bytes per pixel (the raw canvas data is read back as BGRX below).
    max_alloc = max(w*h for w, h in sizes) * 4
    print(f"Shared memory size in bytes: {max_alloc} (took {time.time() - size_start}s to determine)")

    memkey = "/pypdfjs_render_shm"
    js_shm = libshm.create(max_alloc, "Buffer", memkey)
    assert js_shm is not None, "Shared memory of this name already exists, go to /dev/shm and remove it."
    py_shm_handle = posix_ipc.SharedMemory(memkey)

    try:
        # The context manager closes the mapping again, also on error; the
        # surrounding try/finally still unlinks the segment afterwards.
        with mmap.mmap(py_shm_handle.fd, py_shm_handle.size) as py_shm:
            for i, (w, h) in enumerate(sizes):
                page = pdf.getPage(i+1)
                viewport = page.getViewport({"scale": scale})
                canvas = libcanvas.createCanvas(w, h)
                context = canvas.getContext("2d")
                # Accessing .promise is what the original relies on to wait
                # for rendering — presumably the bridge resolves it here.
                page.render({"canvasContext": context, "viewport": viewport}).promise

                # the author is not aware of a way to create a canvas backed by an external buffer, so this copies
                js_buffer = canvas.toBuffer("raw")
                copy_start = time.time()
                js_buffer.copy(js_shm)
                print(f"Data transfer took {time.time() - copy_start}s")

                pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_shm, "raw", "BGRX", 0, 1)
                py_shm.seek(0)
                pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
                # Drop the image so it cannot keep a buffer export of the
                # mapping alive when the mapping is closed.
                del pil_image
    finally:
        # Need to use native (non-js) functions to reliably destroy shared memory. Bridge seems to break in case of KeyboardInterrupt.
        py_shm_handle.close_fd()
        py_shm_handle.unlink()
        assert not Path("/dev/shm" + memkey).exists()

    # JS-side cleanup stays outside the finally block, since bridge calls
    # are reported to be unreliable after an interrupt.
    pdf.destroy()
def main():
    """Parse the command line and render the given PDF into ``--outdir``.

    Command-line arguments:
        input: Input file path or ``http://`` / ``https://`` URL. File paths
            are user-expanded and resolved; URLs are passed through as-is.
        --outdir / -o: Output directory (required); created if missing.
        --scale: Rendering scale factor (float, default 4).
    """

    def path_type(p):
        # Normalise a path argument to an absolute Path with "~" expanded.
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Only real http(s) URLs bypass path resolution. Matching a bare
        # "http" prefix would also catch local files like "http_notes.pdf".
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
                    "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )
    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok makes a separate exists() check unnecessary.
    args.outdir.mkdir(parents=True, exist_ok=True)

    render_pdf(args.input, args.outdir, scale=args.scale)


# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
To put the performance in perspective: this script takes ~2 min to render the CinelerraGG user manual (~690 pages) at scale 4 (~300 dpi).
pypdfium2 (in linear mode) manages this in ~45s, and in parallel (with 4 processes) in ~20s.
However, I'm not sure how much of the difference is due to pdf.js itself and how much is due to bridging overhead. I suspect the latter still has a significant impact.
The shared memory situation on the JS side is displeasing, to say the least.
Maybe https://github.com/kyr0/node-libsharedmemory would be a more portable alternative to shm-typed-array
...
On the other hand, it doesn't provide a memory view that reflects changes, only a stream read/write interface.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Historical note: the author created extremeheat/JSPyBridge#103 (`blobValueOf()`) for this use case.