Last active
April 4, 2025 00:02
-
-
Save mara004/51c3216a9eabd3dcbc78a86d877a61dc to your computer and use it in GitHub Desktop.
PDF rendering with PDFBox, from Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0

# Renders every page of TEST_FILE with Apache PDFBox (driven from Python via
# JPype) and saves each page as a JPEG in OUTPUT_DIR using Pillow.
# Assuming you have an Apache PDFBox 3 jar in the same directory

from pathlib import Path

import jpype
import jpype.imports
import PIL.Image

RELATIVE_DIR = Path(__file__).resolve().parent
PDFBOX_JAR = RELATIVE_DIR / "pdfbox.jar"

# Fail early with a clear message: without the jar on the classpath, the
# failure would only show up later, at the `org.apache.pdfbox` imports below.
if not PDFBOX_JAR.exists():
    raise FileNotFoundError(f"PDFBox jar not found: {PDFBOX_JAR}")

# TODO defer JVM startup and imports to first demand?
jpype.addClassPath(PDFBOX_JAR)
jpype.startJVM("-Djava.awt.headless=true")

# These resolve Java packages through jpype.imports, which is why they are
# placed after the classpath setup and startJVM() rather than at the top.
import java.io as jio
import org.apache.pdfbox as pdfbox
from org.apache.pdfbox.rendering import ImageType

DPI = 300
TEST_FILE = Path("~/projects/scripts/out/38.pdf").expanduser()
OUTPUT_DIR = RELATIVE_DIR / "out"
OUTPUT_DIR.mkdir(exist_ok=True)

# Pillow raw mode describing the memory layout of each PDFBox ImageType.
ImageTypeToPIL = {
    ImageType.BINARY: "1",  # TYPE_BYTE_BINARY
    ImageType.GRAY: "L",  # TYPE_BYTE_GRAY
    ImageType.RGB: "BGRX",  # TYPE_INT_RGB, actually BGRX in memory
    ImageType.ARGB: "BGRA",  # TYPE_INT_ARGB, actually BGRA in memory
    ImageType.BGR: "BGR",  # TYPE_3BYTE_BGR
}

imgtype = ImageType.BGR
pil_srcmode = ImageTypeToPIL[imgtype]
# The Pillow image itself uses the RGB-ordered counterpart of the raw mode
# ("BGR" -> "RGB", "BGRX" -> "RGBX", "BGRA" -> "RGBA"); "1" and "L" are unchanged.
pil_dstmode = pil_srcmode.replace("BGR", "RGB")

pdf = pdfbox.Loader.loadPDF(jio.File(str(TEST_FILE)))
try:
    renderer = pdfbox.rendering.PDFRenderer(pdf)
    n_pages = int(pdf.getNumberOfPages())

    for i in range(n_pages):
        print(f"Rendering page {i+1} ...")
        j_image = renderer.renderImageWithDPI(i, DPI, imgtype)
        w, h = int(j_image.getWidth()), int(j_image.getHeight())

        # Expose the Java array backing the raster to Python as a buffer.
        j_data = j_image.getRaster().getDataBuffer().getData()
        py_data = memoryview(j_data)

        # passing a memoryview requires PIL >= 9.5
        # Trailing decoder args are (raw mode, stride, orientation).
        py_image = PIL.Image.frombuffer(
            pil_dstmode, (w, h), py_data, "raw", pil_srcmode, 0, 1
        )
        py_image.save(OUTPUT_DIR / f"render_{i+1}.jpg")
finally:
    # Release the document even if rendering or saving a page fails.
    pdf.close()

jpype.shutdownJVM()  # TODO atexit.register
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TODO: might theoretically want to handle the data buffer offset, though in my testing it always seems to be 0 in practice.
Also want to look into different input types (bytes, byte buffer) and parallelization, as these would be relevant for a generic API.