-
-
Save vgoklani/774ea6b8e2ba759eeec84a1471de2fda to your computer and use it in GitHub Desktop.
PDF rendering with pdf.js, from Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2023 mara004
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge)
# Js-Depends: pdfjs-dist, canvas
# Use `python -m pip install` and `python -m javascript --install`
import argparse | |
from pathlib import Path | |
import PIL.Image | |
import javascript | |
# NOTE: canvas must be the same build that pdfjs is linked against, otherwise it will fail with a type error
pdfjs = javascript.require("pdfjs-dist") | |
libcanvas = javascript.require("canvas") | |
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to a JPEG file using pdf.js.

    Args:
        input: Document source, handed to ``pdfjs.getDocument()`` unchanged.
        outdir: Output directory. It is combined with the file name via the
            ``/`` operator, so a ``pathlib.Path`` is expected.
        scale: Scale factor passed to ``page.getViewport()``.

    Each page is written to ``outdir`` as ``out_<n>.jpg``, where ``<n>`` is the
    1-based page number zero-padded to the number of digits in the page count.
    """
    # The value of `.promise` is used directly as the loaded document below.
    pdf = pdfjs.getDocument(input).promise
    try:
        n_pages = pdf.numPages
        n_digits = len(str(n_pages))

        for i in range(1, n_pages + 1):
            page = pdf.getPage(i)
            viewport = page.getViewport({"scale": scale})
            # Canvas dimensions must be integers; fractional pixels are truncated.
            w, h = int(viewport.width), int(viewport.height)

            canvas = libcanvas.createCanvas(w, h)
            context = canvas.getContext("2d")
            page.render({"canvasContext": context, "viewport": viewport}).promise

            # note that blobValueOf() is much faster than valueOf()["data"] for large byte buffers
            js_buffer = canvas.toBuffer("raw")
            py_buffer = js_buffer.blobValueOf()

            # The raw canvas buffer is decoded as 4-byte BGRX pixels.
            pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_buffer, "raw", "BGRX", 0, 1)
            pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
    finally:
        # Release the document even when rendering or saving a page fails.
        pdf.destroy()
def main():
    """Parse the command line and render the requested PDF.

    Positional ``input`` is a file path or an http(s) URL. ``--outdir``/``-o``
    is mandatory and is created (including parents) when missing. ``--scale``
    is a float defaulting to 4. The parsed values are forwarded to
    ``render_pdf()``.
    """

    def path_type(p):
        # Normalise to an absolute path with ``~`` expanded.
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Only genuine http(s) URLs are passed through untouched. A bare
        # "http" prefix check would also catch local paths such as
        # "httpdocs/file.pdf" and skip resolving them.
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )
    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    # Required: without it args.outdir would be None and the mkdir call below
    # would fail with an AttributeError instead of a usage message.
    parser.add_argument(
        "--outdir", "-o", type=path_type, required=True,
        help="Output directory (created if it does not exist).",
    )
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already covers the "directory exists" case, so no
    # separate exists() check is needed.
    args.outdir.mkdir(parents=True, exist_ok=True)

    render_pdf(args.input, args.outdir, scale=args.scale)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment