Last active
October 25, 2024 02:05
-
-
Save birkin/82e6212ef0b7388b4afd049ed705b13f to your computer and use it in GitHub Desktop.
inspect PDF DecodeParms dict
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Checks for invalid keys in the DecodeParms dictionary of images in a PDF file. | |
------- | |
Usage: | |
- setup venv | |
- install pikepdf | |
% python ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf" | |
...or, just... | |
% uv run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf" | |
- if you want to see which versions and sources `uv` is using under the hood, add the verbosity flag:
% uv -v run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf" | |
------- | |
If invalid keys are found, the script will print a message like this: | |
**Invalid keys in DecodeParms:** ``['/Colors', '/BitsPerComponent', '/Predictor', '/Columns']`` | |
------- | |
The "# /// script" block near the top of the file is inline script metadata, as defined by the recently adopted Python Enhancement Proposal PEP 723.
<https://peps.python.org/pep-0723/> | |
It's a standard way to specify metadata that can be used by other tools; it won't interfere with running it traditionally. | |
""" | |
# /// script | |
# requires-python = "~=3.10.0" | |
# dependencies = [ | |
# "pikepdf~=9.3.0", | |
# ] | |
# /// | |
import argparse | |
import pikepdf # <https://pikepdf.readthedocs.io/en/stable/> | |
def _filter_names(stream):
    """Return the names in *stream*'s /Filter entry as a list of ``str``.

    /Filter may be absent, a single name, or an array of names (a filter
    chain). pikepdf hands back a ``pikepdf.Name`` or ``pikepdf.Array`` -- never
    a Python ``list`` -- so both container types are accepted here.
    """
    filters = stream.get('/Filter', [])
    if isinstance(filters, (list, pikepdf.Array)):
        return [str(name) for name in filters]
    return [str(filters)]


def _dct_decode_parms(stream, filter_names):
    """Return the DecodeParms dictionary that applies to /DCTDecode, or None.

    When /DecodeParms is an array, it runs parallel to the /Filter array, so
    the entry at the position of '/DCTDecode' is selected. Anything that is not
    a dictionary (e.g. a null placeholder or a missing slot) yields None.
    """
    decode_parms = stream.get('/DecodeParms', {})
    if isinstance(decode_parms, pikepdf.Array):
        position = filter_names.index('/DCTDecode')
        if position >= len(decode_parms):
            return None
        decode_parms = decode_parms[position]
    if not isinstance(decode_parms, (dict, pikepdf.Dictionary)):
        return None
    return decode_parms


def inspect_decodeparms(pdf_path):
    """Report DCTDecode images whose DecodeParms contain unexpected keys.

    Walks every page's /Resources -> /XObject entries, and for each XObject
    whose filter (or filter chain) includes /DCTDecode, prints a line listing
    any DecodeParms keys other than '/ColorTransform'.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pikepdf.open``.

    Returns:
        None. Findings are written to stdout, one line per offending XObject.
    """
    with pikepdf.open(pdf_path) as pdf:
        for page in pdf.pages:
            # NOTE(review): only the page's own /Resources entry is consulted;
            # resources inherited from a parent page-tree node are not looked up.
            resources = page.get('/Resources', {})
            xobjects = resources.get('/XObject', {})
            for _xobj_name, xobj in xobjects.items():
                filter_names = _filter_names(xobj)
                if '/DCTDecode' not in filter_names:
                    continue
                decode_parms = _dct_decode_parms(xobj, filter_names)
                if not decode_parms:
                    # Missing or empty DecodeParms: nothing to validate.
                    continue
                invalid_keys = [k for k in decode_parms.keys() if k != '/ColorTransform']
                if invalid_keys:
                    print( f'**Invalid keys in DecodeParms:** ``{invalid_keys}``' )
if __name__ == "__main__":
    # Command-line entry point: a single mandatory option naming the PDF to check.
    cli = argparse.ArgumentParser(description='Inspect DecodeParms in PDF images')
    cli.add_argument('--pdf_path', required=True, help='Path to the PDF file to inspect')
    # Parse the command line and hand the path directly to the inspector.
    inspect_decodeparms(cli.parse_args().pdf_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment