Last active
April 4, 2025 00:02
-
-
Save mara004/51c3216a9eabd3dcbc78a86d877a61dc to your computer and use it in GitHub Desktop.
PDF rendering with PDFBox, from Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
# Assuming you have an Apache PDFBox 3 jar in the same directory
from pathlib import Path

import jpype
import jpype.imports
import PIL.Image

RELATIVE_DIR = Path(__file__).resolve().parent
PDFBOX_JAR = RELATIVE_DIR / "pdfbox.jar"

# TODO defer JVM startup and imports to first demand?
# The jar is put on the class path and the JVM is started *before* the Java
# package imports below, which are resolved through jpype.imports.
jpype.addClassPath(PDFBOX_JAR)
jpype.startJVM("-Djava.awt.headless=true")

import java.io as jio  # noqa: E402
import org.apache.pdfbox as pdfbox  # noqa: E402
from org.apache.pdfbox.rendering import ImageType  # noqa: E402

# Render resolution, input document and output location.
DPI = 300
TEST_FILE = Path("~/projects/scripts/out/38.pdf").expanduser()
OUTPUT_DIR = RELATIVE_DIR / "out"
OUTPUT_DIR.mkdir(exist_ok=True)

# Maps each PDFBox ImageType to the PIL raw mode that describes the memory
# layout of the rendered image's data buffer.
ImageTypeToPIL = {
    ImageType.BINARY: "1",   # TYPE_BYTE_BINARY
    ImageType.GRAY: "L",     # TYPE_BYTE_GRAY
    ImageType.RGB: "BGRX",   # TYPE_INT_RGB, actually BGRX in memory
    ImageType.ARGB: "BGRA",  # TYPE_INT_ARGB, actually BGRA in memory
    ImageType.BGR: "BGR",    # TYPE_3BYTE_BGR
}

imgtype = ImageType.BGR
# Source mode = layout of the Java buffer; destination mode = the PIL image
# mode, i.e. the same layout with the BGR channel order swapped to RGB.
pil_srcmode = ImageTypeToPIL[imgtype]
pil_dstmode = pil_srcmode.replace("BGR", "RGB")
# NOTE(review): the output below is saved as JPEG; an image type with alpha
# (ARGB -> "RGBA") would presumably need converting first -- confirm.

pdf = pdfbox.Loader.loadPDF(jio.File(str(TEST_FILE)))
try:
    renderer = pdfbox.rendering.PDFRenderer(pdf)
    n_pages = int(pdf.getNumberOfPages())

    for i in range(n_pages):
        print(f"Rendering page {i+1} ...")
        j_image = renderer.renderImageWithDPI(i, DPI, imgtype)
        w, h = int(j_image.getWidth()), int(j_image.getHeight())

        # Access the raster's backing array directly rather than copying
        # pixel by pixel.
        j_data = j_image.getRaster().getDataBuffer().getData()
        py_data = memoryview(j_data)

        # passing a memoryview requires PIL >= 9.5
        py_image = PIL.Image.frombuffer(
            pil_dstmode, (w, h), py_data, "raw", pil_srcmode, 0, 1
        )
        py_image.save(OUTPUT_DIR / f"render_{i+1}.jpg")
finally:
    # Release the document even when rendering or saving a page fails.
    pdf.close()

jpype.shutdownJVM()  # TODO atexit.register
TODO: In theory the data buffer's offset should be handled as well, though in my testing it has always been 0 in practice.
It would also be worth looking into other input types (bytes, byte buffer) and into parallelization, as both would be relevant for a generic API.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Actually, I see now there has been some independent discussion on this already, and a kind of similar gist written by @lebedov:
lebedov/python-pdfbox#10
https://gist.github.com/lebedov/cefbec588c2b2bd0251ba505bd8bf933
https://gist.github.com/lebedov/3518142a5c2431b8c9a28d323100558a
Our code above uses a slightly different approach for data transfer, though (accessing the underlying buffer directly instead of calling `getRGB()`), and the color space handling is new, too. Moreover, we use the newer v3 API of PDFBox, while the previous examples were still written for PDFBox v2.