mara004 · October 4, 2024 05:06 · mara004 · Sep 11, 2024
diff --git a/pypdfjs.py b/pypdfjs.py
 # SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
 # SPDX-License-Identifier: Apache-2.0

 # This is an experimental pdf.js interface using shared memory.
 # Unfortunately, shm-typed-array does not support Windows, so this is not exactly portable.
 # For an older version by the same author that uses pipe-based data transfer via JSPyBridge's .blobValueOf(), see the link below:
 # https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py

 # Py-Depends: pillow, javascript (JSPyBridge), posix_ipc
 # Js-Depends: pdfjs-dist, canvas, shm-typed-array
 # You can use `python -m pip install`, and `python -m javascript --install`
 # NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise.

 import time

 starttm = time.time()
 import mmap
 import argparse
 from pathlib import Path

 # third-party
 import PIL.Image
 import javascript
 import posix_ipc

 # Directory that contains this script, stored as a plain string.
 THIS_DIR = str(Path(__file__).resolve().parent)

 # NOTE The canvas module has to be the very build pdf.js is linked against, otherwise rendering fails with a type error.
 pdfjs = javascript.require(str(Path(THIS_DIR, "pdf.js/build/generic/build/pdf.js")))
 libcanvas = javascript.require(str(Path(THIS_DIR, "pdf.js/node_modules/canvas")))
 libshm = javascript.require("shm-typed-array")

 # Report how long the imports and require() calls took, then drop the timer from the module namespace.
 print(f"Imports took {time.time() - starttm}s")
 del starttm


 def render_pdf(input, outdir, scale):
     """
     Render all pages of a PDF with pdf.js and save them as JPEG files.

     Pixel data travels from nodejs to Python through one POSIX shared memory
     segment, allocated once for the largest page and reused for every page.

     Parameters:
         input: File path or URL, handed to pdfjs.getDocument() unchanged.
         outdir: Output directory (str or Path). Pages are written as
             out_<i>.jpg, where <i> is the zero-based, zero-padded page index.
         scale: Scale factor handed to page.getViewport().

     Raises:
         ValueError: The document reports no pages.
         AssertionError: The shared memory segment could not be created,
             or it is still present under /dev/shm after unlinking.
     """
     outdir = Path(outdir)  # accept plain strings as well as Path objects
     pdf = pdfjs.getDocument(input).promise

     try:
         n_pages = pdf.numPages
         if n_pages < 1:
             # Fail with a clear message instead of max() choking on an empty sequence below.
             raise ValueError("Document has no pages, nothing to render.")
         n_digits = len(str(n_pages))

         # First pass: collect the pixel size of every page, so that a single
         # buffer big enough for the largest page can be allocated.
         measure_start = time.time()
         sizes = []
         for i in range(n_pages):
             viewport = pdf.getPage(i+1).getViewport({"scale": scale})
             sizes.append( (int(viewport.width), int(viewport.height)) )

         max_alloc = max(w*h for w, h in sizes) * 4  # 4 bytes per pixel (BGRX)
         print(f"Shared memory size in bytes: {max_alloc} (took {time.time() - measure_start}s to determine)")

         memkey = "/pypdfjs_render_shm"
         js_shm = libshm.create(max_alloc, "Buffer", memkey)
         assert js_shm is not None, "Shared memory of this name already exists, go to /dev/shm and remove it."
         py_shm_handle = posix_ipc.SharedMemory(memkey)

         try:
             py_shm = mmap.mmap(py_shm_handle.fd, py_shm_handle.size)

             # Second pass: render each page, copy the pixels into shared memory, and save from there.
             for i, (w, h) in enumerate(sizes):

                 page = pdf.getPage(i+1)
                 viewport = page.getViewport({"scale": scale})

                 canvas = libcanvas.createCanvas(w, h)
                 context = canvas.getContext("2d")
                 page.render({"canvasContext": context, "viewport": viewport}).promise

                 # the author is not aware of a way to create a canvas backed by an external buffer, so this copies
                 js_buffer = canvas.toBuffer("raw")
                 copy_start = time.time()
                 js_buffer.copy(js_shm)
                 print(f"Data transfer took {time.time() - copy_start}s")

                 pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_shm, "raw", "BGRX", 0, 1)
                 py_shm.seek(0)  # NOTE(review): presumably a precaution; confirm whether frombuffer() depends on the position
                 pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
         finally:
             # Need to use native (non-js) functions to reliably destroy shared memory. Bridge seems to break in case of KeyboardInterrupt.
             py_shm_handle.close_fd()
             py_shm_handle.unlink()
             assert not Path("/dev/shm" + memkey).exists()
     except Exception:
         # Ordinary failure: release the document on the JS side before propagating.
         # KeyboardInterrupt is deliberately left alone, since the bridge seems to break in that case.
         pdf.destroy()
         raise

     pdf.destroy()


 def main():
     """
     Command-line entry point: parse the arguments, make sure the output
     directory exists, and render the given PDF via render_pdf().
     """
     # Converters for argparse: output paths are expanded and made absolute;
     # inputs that start with "http" are passed through as URLs, anything else is treated as a path.
     path_type = lambda raw: Path(raw).expanduser().resolve()
     input_type = lambda raw: raw if raw.startswith("http") else str(path_type(raw))

     description = (
         "Render a PDF file with Mozilla pdf.js via JsPyBridge.\n"
         "Known issues: - URL support is buggy; - certain PDFs may hit memory limits."
     )
     parser = argparse.ArgumentParser(description=description)
     parser.add_argument("input", type=input_type, help="Input file path or URL.")
     parser.add_argument("--outdir", "-o", type=path_type, required=True)
     parser.add_argument("--scale", type=float, default=4)

     opts = parser.parse_args()
     target_dir = opts.outdir
     if not target_dir.exists():
         target_dir.mkdir(parents=True, exist_ok=True)

     render_pdf(opts.input, target_dir, scale=opts.scale)


 # Entry point: invoked unconditionally at module level, i.e. whenever this file is executed.
 main()
	# SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
	# SPDX-License-Identifier: Apache-2.0

	# This is an experimental pdf.js interface using shared memory.
	# Unfortunately, shm-typed-array does not support Windows, so this is not exactly portable.
	# For an older version by the same author that uses pipe-based data transfer via JSPyBridge's .blobValueOf(), see the link below:
	# https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py

	# Py-Depends: pillow, javascript (JSPyBridge), posix_ipc
	# Js-Depends: pdfjs-dist, canvas, shm-typed-array
	# You can use `python -m pip install`, and `python -m javascript --install`
	# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise.

	import time

	starttm = time.time()
	import mmap
	import argparse
	from pathlib import Path

	# third-party
	import PIL.Image
	import javascript
	import posix_ipc

	# Directory that contains this script, stored as a plain string.
	THIS_DIR = str(Path(__file__).resolve().parent)

	# NOTE The canvas module has to be the very build pdf.js is linked against, otherwise rendering fails with a type error.
	pdfjs = javascript.require(str(Path(THIS_DIR, "pdf.js/build/generic/build/pdf.js")))
	libcanvas = javascript.require(str(Path(THIS_DIR, "pdf.js/node_modules/canvas")))
	libshm = javascript.require("shm-typed-array")

	# Report how long the imports and require() calls took, then drop the timer from the module namespace.
	print(f"Imports took {time.time() - starttm}s")
	del starttm


	def render_pdf(input, outdir, scale):
	    """
	    Render all pages of a PDF with pdf.js and save them as JPEG files.

	    Pixel data travels from nodejs to Python through one POSIX shared memory
	    segment, allocated once for the largest page and reused for every page.

	    Parameters:
	        input: File path or URL, handed to pdfjs.getDocument() unchanged.
	        outdir: Output directory (str or Path). Pages are written as
	            out_<i>.jpg, where <i> is the zero-based, zero-padded page index.
	        scale: Scale factor handed to page.getViewport().

	    Raises:
	        ValueError: The document reports no pages.
	        AssertionError: The shared memory segment could not be created,
	            or it is still present under /dev/shm after unlinking.
	    """
	    outdir = Path(outdir)  # accept plain strings as well as Path objects
	    pdf = pdfjs.getDocument(input).promise

	    try:
	        n_pages = pdf.numPages
	        if n_pages < 1:
	            # Fail with a clear message instead of max() choking on an empty sequence below.
	            raise ValueError("Document has no pages, nothing to render.")
	        n_digits = len(str(n_pages))

	        # First pass: collect the pixel size of every page, so that a single
	        # buffer big enough for the largest page can be allocated.
	        measure_start = time.time()
	        sizes = []
	        for i in range(n_pages):
	            viewport = pdf.getPage(i+1).getViewport({"scale": scale})
	            sizes.append( (int(viewport.width), int(viewport.height)) )

	        max_alloc = max(w*h for w, h in sizes) * 4  # 4 bytes per pixel (BGRX)
	        print(f"Shared memory size in bytes: {max_alloc} (took {time.time() - measure_start}s to determine)")

	        memkey = "/pypdfjs_render_shm"
	        js_shm = libshm.create(max_alloc, "Buffer", memkey)
	        assert js_shm is not None, "Shared memory of this name already exists, go to /dev/shm and remove it."
	        py_shm_handle = posix_ipc.SharedMemory(memkey)

	        try:
	            py_shm = mmap.mmap(py_shm_handle.fd, py_shm_handle.size)

	            # Second pass: render each page, copy the pixels into shared memory, and save from there.
	            for i, (w, h) in enumerate(sizes):

	                page = pdf.getPage(i+1)
	                viewport = page.getViewport({"scale": scale})

	                canvas = libcanvas.createCanvas(w, h)
	                context = canvas.getContext("2d")
	                page.render({"canvasContext": context, "viewport": viewport}).promise

	                # the author is not aware of a way to create a canvas backed by an external buffer, so this copies
	                js_buffer = canvas.toBuffer("raw")
	                copy_start = time.time()
	                js_buffer.copy(js_shm)
	                print(f"Data transfer took {time.time() - copy_start}s")

	                pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_shm, "raw", "BGRX", 0, 1)
	                py_shm.seek(0)  # NOTE(review): presumably a precaution; confirm whether frombuffer() depends on the position
	                pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
	        finally:
	            # Need to use native (non-js) functions to reliably destroy shared memory. Bridge seems to break in case of KeyboardInterrupt.
	            py_shm_handle.close_fd()
	            py_shm_handle.unlink()
	            assert not Path("/dev/shm" + memkey).exists()
	    except Exception:
	        # Ordinary failure: release the document on the JS side before propagating.
	        # KeyboardInterrupt is deliberately left alone, since the bridge seems to break in that case.
	        pdf.destroy()
	        raise

	    pdf.destroy()


	def main():
	    """
	    Command-line entry point: parse the arguments, make sure the output
	    directory exists, and render the given PDF via render_pdf().
	    """
	    # Converters for argparse: output paths are expanded and made absolute;
	    # inputs that start with "http" are passed through as URLs, anything else is treated as a path.
	    path_type = lambda p: Path(p).expanduser().resolve()
	    input_type = lambda p: p if p.startswith("http") else str(path_type(p))

	    parser = argparse.ArgumentParser(
	        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
	        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
	    )
	    parser.add_argument(
	        "input", type=input_type,
	        help="Input file path or URL.",
	    )
	    parser.add_argument("--outdir", "-o", type=path_type, required=True)
	    parser.add_argument("--scale", type=float, default=4)

	    args = parser.parse_args()
	    if not args.outdir.exists():
	        args.outdir.mkdir(parents=True, exist_ok=True)

	    render_pdf(args.input, args.outdir, scale=args.scale)


	# Entry point: invoked unconditionally at module level, i.e. whenever this file is executed.
	main()