birkin · October 25, 2024 02:05
diff --git a/pdf_check_via_pikepdf.py b/pdf_check_via_pikepdf.py
 """
 Checks for invalid keys in the DecodeParms dictionary of images in a PDF file.
 -------

 Usage:
 - setup venv
 - install pikepdf
 % python ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"

 ...or, just...

 % uv run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"

 - if you want to see which versions and sources `uv` is using under the hood, add the verbosity flag:
  % uv -v run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"
 -------

 If invalid keys are found, the script will print a message like this:
 **Invalid keys in DecodeParms:** ``['/Colors', '/BitsPerComponent', '/Predictor', '/Columns']``
 -------

 The "# /// script" block near the top of the file follows a recently adopted Python Enhancement Proposal, PEP 723.
    <https://peps.python.org/pep-0723/>
    It's a standard way to specify metadata that can be used by other tools; it won't interfere with running it traditionally.
 """

 # /// script
 # requires-python = "~=3.10.0"
 # dependencies = [
 #     "pikepdf~=9.3.0",
 # ]
 # ///

 import argparse
 import pikepdf  # <https://pikepdf.readthedocs.io/en/stable/>

 def inspect_decodeparms(pdf_path):
    """
    Prints any unexpected DecodeParms keys found on DCTDecode image XObjects.

    Walks each page's `/Resources` -> `/XObject` entries. For every XObject whose
    `/Filter` includes `/DCTDecode`, every `/DecodeParms` key other than
    `/ColorTransform` is treated as invalid and reported via print().

    Args:
        pdf_path: path to the PDF file; handed directly to pikepdf.open().

    Returns:
        None -- findings are printed, not returned.
    """
    with pikepdf.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            resources = page.get('/Resources', {})
            xobjects = resources.get('/XObject', {})
            for xobj_name, xobj_ref in xobjects.items():
                # print( f'Page ``{page_num}``, xobj_name, ``{xobj_name}``' )
                filters = xobj_ref.get('/Filter', [])
                # A multi-filter /Filter comes back as a pikepdf.Array, which is not a
                # Python list; unpack it so the membership test sees each filter name.
                if isinstance(filters, pikepdf.Array):
                    filters = list(filters)
                elif not isinstance(filters, list):
                    filters = [filters]
                if '/DCTDecode' not in filters:
                    continue
                decode_parms = xobj_ref.get('/DecodeParms', {})
                # With several filters, /DecodeParms is an array parallel to /Filter;
                # pick the entry that belongs to the DCTDecode filter.
                if isinstance(decode_parms, pikepdf.Array):
                    dct_index = filters.index('/DCTDecode')
                    decode_parms = decode_parms[dct_index] if dct_index < len(decode_parms) else None
                # Skip absent/null entries and anything that is not a dictionary.
                if not isinstance(decode_parms, pikepdf.Dictionary):
                    continue
                invalid_keys = [k for k in decode_parms.keys() if k != '/ColorTransform']
                if invalid_keys:
                    print( f'**Invalid keys in DecodeParms:** ``{invalid_keys}``' )

 if __name__ == "__main__":
    # Script entry point: read the one required option, then run the check on it.
    cli = argparse.ArgumentParser(description='Inspect DecodeParms in PDF images')
    cli.add_argument('--pdf_path', required=True, help='Path to the PDF file to inspect')
    inspect_decodeparms(cli.parse_args().pdf_path)
	"""
	Checks for invalid keys in the DecodeParms dictionary of images in a PDF file.
	-------

	Usage:
	- setup venv
	- install pikepdf
	% python ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"

	...or, just...

	% uv run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"

	- if you want to see which versions and sources `uv` is using under the hood, add the verbosity flag:
	% uv -v run ./pdf_check_via_pikepdf.py --pdf_path "/path/to/the.pdf"
	-------

	If invalid keys are found, the script will print a message like this:
	Invalid keys in DecodeParms: ``['/Colors', '/BitsPerComponent', '/Predictor', '/Columns']``
	-------

	The "# /// script" block near the top of the file follows a recently adopted Python Enhancement Proposal, PEP 723.
	<https://peps.python.org/pep-0723/>
	It's a standard way to specify metadata that can be used by other tools; it won't interfere with running it traditionally.
	"""

	# /// script
	# requires-python = "~=3.10.0"
	# dependencies = [
	# "pikepdf~=9.3.0",
	# ]
	# ///

	import argparse
	import pikepdf # <https://pikepdf.readthedocs.io/en/stable/>

	def inspect_decodeparms(pdf_path):
	    """
	    Prints any unexpected DecodeParms keys found on DCTDecode image XObjects.

	    Walks each page's `/Resources` -> `/XObject` entries. For every XObject whose
	    `/Filter` includes `/DCTDecode`, every `/DecodeParms` key other than
	    `/ColorTransform` is treated as invalid and reported via print().

	    Args:
	        pdf_path: path to the PDF file; handed directly to pikepdf.open().

	    Returns:
	        None -- findings are printed, not returned.
	    """
	    with pikepdf.open(pdf_path) as pdf:
	        for page_num, page in enumerate(pdf.pages, start=1):
	            resources = page.get('/Resources', {})
	            xobjects = resources.get('/XObject', {})
	            for xobj_name, xobj_ref in xobjects.items():
	                # print( f'Page ``{page_num}``, xobj_name, ``{xobj_name}``' )
	                filters = xobj_ref.get('/Filter', [])
	                # A multi-filter /Filter comes back as a pikepdf.Array, which is not a
	                # Python list; unpack it so the membership test sees each filter name.
	                if isinstance(filters, pikepdf.Array):
	                    filters = list(filters)
	                elif not isinstance(filters, list):
	                    filters = [filters]
	                if '/DCTDecode' not in filters:
	                    continue
	                decode_parms = xobj_ref.get('/DecodeParms', {})
	                # With several filters, /DecodeParms is an array parallel to /Filter;
	                # pick the entry that belongs to the DCTDecode filter.
	                if isinstance(decode_parms, pikepdf.Array):
	                    dct_index = filters.index('/DCTDecode')
	                    decode_parms = decode_parms[dct_index] if dct_index < len(decode_parms) else None
	                # Skip absent/null entries and anything that is not a dictionary.
	                if not isinstance(decode_parms, pikepdf.Dictionary):
	                    continue
	                invalid_keys = [k for k in decode_parms.keys() if k != '/ColorTransform']
	                if invalid_keys:
	                    print( f'Invalid keys in DecodeParms: ``{invalid_keys}``' )

	if __name__ == "__main__":
	    # Script entry point: read the one required option, then run the check on it.
	    cli = argparse.ArgumentParser(description='Inspect DecodeParms in PDF images')
	    cli.add_argument('--pdf_path', required=True, help='Path to the PDF file to inspect')
	    inspect_decodeparms(cli.parse_args().pdf_path)