vanschnapen · November 11, 2024 15:45 · nylander · Nov 9, 2024 · vanschnapen · Nov 11, 2024
diff --git a/anonymize-pdf.py b/anonymize-pdf.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 """anonymize-pdf: Anonymize document and annotation metadata in PDFs.
 """

 import argparse
 import os
 import sys

 try:
     # check if pymupdf is importable (installed)
     import pymupdf
 except ImportError:
     # if not importable, try to install pymupdf
     import subprocess
     # Invoke pip through the interpreter that runs this script, so the package
     # is installed into the environment the restarted script will use
     # (a bare "pip" on PATH may belong to a different Python).
     # Alternative recommended by the documentation: add "--upgrade", see
     # https://pymupdf.readthedocs.io/en/latest/installation.html#id1
     install_command = [sys.executable, "-m", "pip", "install", "PyMuPDF"]
     print("Trying to install required package PyMuPDF by '" + " ".join(install_command) + "'")
     subprocess.call(install_command)
     print("Please restart the script!")
     raise

 # parse argvalues from commandline
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument("infile",
                     help="path to input pdf file (infile.pdf)")
 parser.add_argument("outfile", nargs="?",
                     help="path to output pdf file (outfile.pdf)", default=None)
 parser.add_argument("-v", "--verbose", help="display metadata before and after anonymization",
                     action="store_true")

 args = parser.parse_args()

 # set filenames from argument parser and set output filename based on input filename if no output filename is given
 infile = args.infile
 if args.outfile:
     outfile = args.outfile
 else:
     # Only touch the real file extension: str.replace('.pdf', ...) would also
     # rewrite '.pdf' inside directory names and would return the input path
     # unchanged for names such as 'scan.PDF' or 'scan'.
     root, ext = os.path.splitext(infile)
     outfile = root + ".anonymized" + (ext or ".pdf")

 # Saving onto the opened input file is rejected by pymupdf at the very end,
 # so fail early with a clear message instead.
 if os.path.abspath(outfile) == os.path.abspath(infile):
     parser.error("output file must differ from input file '" + infile + "'")

 print("Using '" + outfile + "' as output filename for inputfile '" + infile + "'")

 # metadata fields / annotation fields that are kept untouched
 METADATA_FIELDS_TO_KEEP = ['format',]
 ANNOTATION_FIELDS_TO_KEEP = ['content']
 # replacement values for removed annotation fields; 'other' is the fallback
 ANNOTATION_FIELD_DEFAULTS = {'modDate': "D:20000000000000-00'00'",
                              'other': 'X'}

 # open the document with pymupdf; the context manager closes it again,
 # also when an error occurs while processing
 with pymupdf.open(infile) as doc:
     metadata = doc.metadata

     # blank all metadata of the document except of fields listed in
     # METADATA_FIELDS_TO_KEEP
     if args.verbose:
         print("document metadata before anonymization:\n", metadata.items())

     metadata = {key: (value if key in METADATA_FIELDS_TO_KEEP else '')
                 for key, value in metadata.items()}

     if args.verbose:
         print("Document metadata after anonymization. You might check for retained infos:\n", metadata.items())
     doc.set_metadata(metadata)
     # NOTE(review): only the document info dictionary is rewritten here; XML
     # (XMP) metadata embedded in the file is not touched - confirm whether it
     # should be removed as well.

     # overwrite all fields of all annotations on all pages, except of those
     # listed in ANNOTATION_FIELDS_TO_KEEP
     if args.verbose:
         print("\nAnnotation metadata before anonymization:")
         for page in doc:
             for annot in page.annots():
                 print(annot.info)

     for page in doc:
         for annot in page.annots():
             anonymized_info = {
                 key: (value if key in ANNOTATION_FIELDS_TO_KEEP
                       else ANNOTATION_FIELD_DEFAULTS.get(
                           key, ANNOTATION_FIELD_DEFAULTS['other']))
                 for key, value in annot.info.items()
             }
             annot.set_info(anonymized_info)
             # annot.update()    # can be omitted for annot.set_info() according to
                                 # https://pymupdf.readthedocs.io/en/latest/annot.html#Annot.update

     if args.verbose:
         print("\nAnnotation metadata after anonymization. You might check for retained infos:")
         for page in doc:
             for annot in page.annots():
                 print(annot.info)

     # finally save document
     doc.save(outfile)

 print("\nSuccessfully saved an anonymized output to '" + outfile + "'!")
diff --git a/license b/license
 MIT License

 Copyright for portions of this fork are held by Vikas Dhiman, 2024, as part 
 of project https://gist.github.com/wecacuee/f1e92d421312b7c7c1907667f4f3a318.
 All other copyright for this fork are held by github.com/vanschnapen, 2024.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	"""anonymize-pdf: Anonymize document and annotation metadata in PDFs.
	"""

	import argparse
	import os
	import sys

	try:
	    # check if pymupdf is importable (installed)
	    import pymupdf
	except ImportError:
	    # if not importable, try to install pymupdf
	    import subprocess
	    # Invoke pip through the interpreter that runs this script, so the package
	    # is installed into the environment the restarted script will use
	    # (a bare "pip" on PATH may belong to a different Python).
	    # Alternative recommended by the documentation: add "--upgrade", see
	    # https://pymupdf.readthedocs.io/en/latest/installation.html#id1
	    install_command = [sys.executable, "-m", "pip", "install", "PyMuPDF"]
	    print("Trying to install required package PyMuPDF by '" + " ".join(install_command) + "'")
	    subprocess.call(install_command)
	    print("Please restart the script!")
	    raise

	# parse argvalues from commandline
	parser = argparse.ArgumentParser(description=__doc__)
	parser.add_argument("infile",
	                    help="path to input pdf file (infile.pdf)")
	parser.add_argument("outfile", nargs="?",
	                    help="path to output pdf file (outfile.pdf)", default=None)
	parser.add_argument("-v", "--verbose", help="display metadata before and after anonymization",
	                    action="store_true")

	args = parser.parse_args()

	# set filenames from argument parser and set output filename based on input filename if no output filename is given
	infile = args.infile
	if args.outfile:
	    outfile = args.outfile
	else:
	    # Only touch the real file extension: str.replace('.pdf', ...) would also
	    # rewrite '.pdf' inside directory names and would return the input path
	    # unchanged for names such as 'scan.PDF' or 'scan'.
	    root, ext = os.path.splitext(infile)
	    outfile = root + ".anonymized" + (ext or ".pdf")

	# Saving onto the opened input file is rejected by pymupdf at the very end,
	# so fail early with a clear message instead.
	if os.path.abspath(outfile) == os.path.abspath(infile):
	    parser.error("output file must differ from input file '" + infile + "'")

	print("Using '" + outfile + "' as output filename for inputfile '" + infile + "'")

	# metadata fields / annotation fields that are kept untouched
	METADATA_FIELDS_TO_KEEP = ['format',]
	ANNOTATION_FIELDS_TO_KEEP = ['content']
	# replacement values for removed annotation fields; 'other' is the fallback
	ANNOTATION_FIELD_DEFAULTS = {'modDate': "D:20000000000000-00'00'",
	                             'other': 'X'}

	# open the document with pymupdf; the context manager closes it again,
	# also when an error occurs while processing
	with pymupdf.open(infile) as doc:
	    metadata = doc.metadata

	    # blank all metadata of the document except of fields listed in
	    # METADATA_FIELDS_TO_KEEP
	    if args.verbose:
	        print("document metadata before anonymization:\n", metadata.items())

	    metadata = {key: (value if key in METADATA_FIELDS_TO_KEEP else '')
	                for key, value in metadata.items()}

	    if args.verbose:
	        print("Document metadata after anonymization. You might check for retained infos:\n", metadata.items())
	    doc.set_metadata(metadata)
	    # NOTE(review): only the document info dictionary is rewritten here; XML
	    # (XMP) metadata embedded in the file is not touched - confirm whether it
	    # should be removed as well.

	    # overwrite all fields of all annotations on all pages, except of those
	    # listed in ANNOTATION_FIELDS_TO_KEEP
	    if args.verbose:
	        print("\nAnnotation metadata before anonymization:")
	        for page in doc:
	            for annot in page.annots():
	                print(annot.info)

	    for page in doc:
	        for annot in page.annots():
	            anonymized_info = {
	                key: (value if key in ANNOTATION_FIELDS_TO_KEEP
	                      else ANNOTATION_FIELD_DEFAULTS.get(
	                          key, ANNOTATION_FIELD_DEFAULTS['other']))
	                for key, value in annot.info.items()
	            }
	            annot.set_info(anonymized_info)
	            # annot.update()    # can be omitted for annot.set_info() according to
	                                # https://pymupdf.readthedocs.io/en/latest/annot.html#Annot.update

	    if args.verbose:
	        print("\nAnnotation metadata after anonymization. You might check for retained infos:")
	        for page in doc:
	            for annot in page.annots():
	                print(annot.info)

	    # finally save document
	    doc.save(outfile)

	print("\nSuccessfully saved an anonymized output to '" + outfile + "'!")
	MIT License

	Copyright for portions of this fork are held by Vikas Dhiman, 2024, as part
	of project https://gist.github.com/wecacuee/f1e92d421312b7c7c1907667f4f3a318.
	All other copyright for this fork are held by github.com/vanschnapen, 2024.

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all
	copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	SOFTWARE.