Last active
April 2, 2025 00:41
-
-
Save mara004/8ef3a803531fdd42b29bbfa2889ff7f3 to your computer and use it in GitHub Desktop.
PDF rendering with Ghostscript (via subprocess)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2024 geisserml <[email protected]> | |
# SPDX-FileCopyrightText: 2024 James R. Barlow <[email protected]> | |
# SPDX-License-Identifier: MPL-2.0 | |
# Initial code derived from ocrmypdf/_exec/ghostscript.py | |
# Note that Ghostscript is AGPL-licensed. However, we are calling it via subprocess here, so not sure whether copyleft would actually apply. | |
# See also https://www.gnu.org/licenses/gpl-faq.en.html#MereAggregation | |
import io | |
import os | |
import re | |
import sys | |
import shutil | |
import logging | |
import subprocess | |
import PIL.Image | |
logger = logging.getLogger(__name__) | |
def get_ghostscript():
    """
    Locate the Ghostscript console executable on the search path.

    On Windows, ``gswin64c`` is tried first and ``gswin32c`` second.
    On every other platform, ``gs`` is looked up.
    Returns the result of the last :func:`shutil.which` call made, i.e. the
    path of the first match, or ``None`` if no candidate was found.
    """
    # TODO consider searching the windows registry, as python-ghostscript and ocrmypdf do
    # https://gitlab.com/pdftools/python-ghostscript/-/blob/9f84bf0e02f04eaad4bd998b9c5bef2be55e6389/ghostscript/_gsprint.py#L501
    # https://github.com/jbarlow83/OCRmyPDF/blob/master/src/ocrmypdf/subprocess/_windows.py
    if sys.platform.startswith('win32'):
        candidates = ('gswin64c', 'gswin32c')
    else:
        candidates = ('gs', )

    located = None
    for exe_name in candidates:
        located = shutil.which(exe_name)
        if located:
            # Stop at the first candidate that resolves to an executable.
            break
    return located
def _gs_error_reported(stream):
    """
    Tell whether the text *stream* mentions an error.

    Returns ``True`` if the substring "error" occurs anywhere in *stream*,
    in any letter case, and ``False`` otherwise.
    """
    hit = re.search('error', stream, flags=re.IGNORECASE)
    return hit is not None
def _gs_rasterise_pdf(
    input_file,
    *,
    pageno,
    raster_dpi,
    password = None,
    raster_device = 'png16m',
):
    """
    Rasterize one page of a PDF at resolution *raster_dpi*.
    *pageno* is the visual (1-based) page number.
    Note that Ghostscript takes /UserUnit into account on its own.

    Parameters:
        input_file: Path of the PDF; converted with :func:`os.fspath`.
        pageno: Page to render, passed as both ``-dFirstPage`` and ``-dLastPage``.
        raster_dpi: Resolution, used for both axes (rounded to 6 decimal places).
        password: If not ``None``, handed to Ghostscript via ``-sPDFPassword``.
        raster_device: Ghostscript output device name (``-sDEVICE``).

    Returns:
        The object returned by :func:`PIL.Image.open` on the bytes Ghostscript
        wrote to its standard output.

    Raises:
        RuntimeError: If no Ghostscript executable is found, or if the
            Ghostscript process exits with a non-zero status. In the latter
            case the underlying :class:`subprocess.CalledProcessError` is
            attached as ``__cause__``.
    """
    # Match the six decimal places that the ':f' format below emits.
    raster_dpi = round(raster_dpi, 6)

    gs = get_ghostscript()
    if not gs:
        raise RuntimeError("Ghostscript could not be found. Make sure it is installed and added to $PATH.")

    args_gs = [
        gs,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        '-dNOPROMPT',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi:f}x{raster_dpi:f}',
        '-dTextAlphaBits=4',
        '-dGraphicsAlphaBits=4',
        '-dInterpolateControl=-1',
    ]
    if password is not None:
        # NOTE(review): the password becomes part of the command line, which may
        # be visible to other local users through the process list.
        args_gs.append(f'-sPDFPassword={password}')
    args_gs.extend(
        [
            # Write the rendered image to our stdout pipe ...
            '-o',
            '-',
            # ... and send Ghostscript's own stdout messages to stderr instead
            # (presumably so they cannot end up inside the image data).
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',
            '-f',
            os.fspath(input_file),
        ]
    )
    #logger.debug(args_gs)

    try:
        pipe = subprocess.run(
            args_gs,
            stdout = subprocess.PIPE,
            stderr = subprocess.PIPE,
            check = True,
        )
    except subprocess.CalledProcessError as error:
        logger.error(error.stderr.decode(errors='replace'))
        # Chain explicitly so the exit status and command stay reachable via __cause__.
        raise RuntimeError('Ghostscript rasterizing failed') from error

    # Ghostscript may exit successfully and still print error messages,
    # so surface them in the log without failing the call.
    stderr = pipe.stderr.decode(errors='replace')
    if _gs_error_reported(stderr):
        logger.error(stderr)

    return PIL.Image.open( io.BytesIO(pipe.stdout) )
def invoke_ghostscript_shell(filepath, index, scale=4, password=None):
    """
    Render a single page of the PDF at *filepath* through Ghostscript.

    *index* is the zero-based page index; it is converted to the 1-based
    page number expected by ``_gs_rasterise_pdf``. *scale* is multiplied by
    72 to obtain the raster resolution in DPI. *password* is passed through
    unchanged. Returns whatever ``_gs_rasterise_pdf`` returns.
    """
    # Note, this does not handle rotation yet -> TODO
    page_number = index + 1
    dpi = scale * 72
    return _gs_rasterise_pdf(
        filepath,
        pageno = page_number,
        raster_dpi = dpi,
        password = password,
    )
Also, multi-page rendering with -sPageList=pageranges
would be preferable to avoid launching a new subprocess for each page.
I started working on this: https://gist.github.com/mara004/428a9aad5d553d4631ab0b5119eb74b2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TODO: see if we can call ghostscript through the ABI-level bindings from https://gitlab.com/pdftools/python-ghostscript
I seem to remember having already tried this at some point, a long time ago, but don't recall what exactly was the problem back then. I think it might have been that I didn't manage to get the in-memory data, and had to use file output anyway.
Yet, even that would already be preferable to avoid subprocess overhead, especially with repeated calls.