-
-
Save vanschnapen/4707a576e822fe4c7349572779311f49 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""anonymize-pdf: Anonymize document and annotation metadata in PDFs.
"""
import argparse
import os.path
import subprocess
import sys

# Arguments handed to "<python> -m pip" when PyMuPDF cannot be imported.
# Swap in ["install", "--upgrade", "PyMuPDF"] if the variant recommended by
# the PyMuPDF documentation is desired:
# https://pymupdf.readthedocs.io/en/latest/installation.html#id1
PIP_INSTALL_ARGS = ["install", "PyMuPDF"]

# Ending of the derived output filename when no output filename is given.
ANONYMIZED_SUFFIX = '.anonymized.pdf'

# Document metadata fields that are kept; all others are blanked.
METADATA_FIELDS_TO_KEEP = ['format',]

# Annotation info fields that are kept; all others are overwritten.
ANNOTATION_FIELDS_TO_KEEP = ['content']
# Replacement values for overwritten annotation fields; 'other' is the
# fallback for every field that has no entry of its own.
ANNOTATION_FIELD_DEFAULTS = {'modDate': "D:20000000000000-00'00'",
                             'other': 'X'}


def load_pymupdf():
    """Import and return the pymupdf module.

    If the import fails, an installation via pip is attempted with the
    interpreter that runs this script (so the package lands in the right
    environment), the user is told how to proceed, and the original
    ImportError is re-raised.

    Raises:
        ImportError: if pymupdf is not importable (even when the
            installation attempt succeeded, a restart is required).
    """
    try:
        import pymupdf
    except ImportError:
        command = [sys.executable, "-m", "pip"] + PIP_INSTALL_ARGS
        print("Trying to install required package PyMuPDF by '" + " ".join(command) + "'")
        # only ask for a restart if pip actually reported success
        if subprocess.call(command) == 0:
            print("Please restart the script!")
        else:
            print("Installation of PyMuPDF failed. Please install it manually!")
        raise
    return pymupdf


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings; None means sys.argv[1:].

    Returns:
        argparse.Namespace with the attributes infile, outfile and verbose.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("infile",
                        help="path to input pdf file (infile.pdf)")
    parser.add_argument("outfile", nargs="?",
                        help="path to output pdf file (outfile.pdf)", default=None)
    parser.add_argument("-v", "--verbose", help="display metadata before and after anonymization",
                        action="store_true")
    return parser.parse_args(argv)


def default_outfile(infile):
    """Derive the output filename from the input filename.

    Only a trailing '.pdf' extension (in any letter case) is replaced, so
    '.pdf' occurring in directory names or in the middle of the filename is
    left alone. If there is no such extension, the suffix is appended; the
    result therefore never equals the input path.

    Args:
        infile: path of the input file as given on the command line.

    Returns:
        The path with its '.pdf' extension replaced by '.anonymized.pdf'.
    """
    root, ext = os.path.splitext(infile)
    if ext.lower() == '.pdf':
        return root + ANONYMIZED_SUFFIX
    return infile + ANONYMIZED_SUFFIX


def anonymize_metadata(metadata):
    """Return a copy of the document metadata with all fields blanked,
    except of those listed in METADATA_FIELDS_TO_KEEP."""
    return {key: (value if key in METADATA_FIELDS_TO_KEEP else '')
            for key, value in metadata.items()}


def anonymize_annotation_info(info):
    """Return a copy of an annotation info dict with all fields replaced by
    their value from ANNOTATION_FIELD_DEFAULTS, except of those listed in
    ANNOTATION_FIELDS_TO_KEEP."""
    fallback = ANNOTATION_FIELD_DEFAULTS['other']
    return {key: (value if key in ANNOTATION_FIELDS_TO_KEEP
                  else ANNOTATION_FIELD_DEFAULTS.get(key, fallback))
            for key, value in info.items()}


def print_annotation_infos(doc):
    """Print the info dict of every annotation on every page of doc."""
    for page in doc:
        for annot in page.annots():
            print(annot.info)


def main(argv=None):
    """Anonymize the input pdf and save the result to the output file.

    Args:
        argv: list of argument strings; None means sys.argv[1:].
    """
    # parse first, so that --help and usage errors do not trigger an install
    args = parse_args(argv)
    pymupdf = load_pymupdf()

    # set filenames from argument parser and set output filename based on
    # input filename if no output filename is given
    infile = args.infile
    if args.outfile:
        outfile = args.outfile
    else:
        outfile = default_outfile(infile)
        print("Using '" + outfile + "' as output filename for inputfile '" + infile + "'")

    # open the document with pymupdf; the context manager closes it again
    with pymupdf.open(infile) as doc:
        # remove all metadata of the document except of fields specified in
        # METADATA_FIELDS_TO_KEEP
        if args.verbose:
            print("document metadata before anonymization:\n", doc.metadata.items())
        metadata = anonymize_metadata(doc.metadata)
        if args.verbose:
            print("Document metadata after anonymization. You might check for retained infos:\n", metadata.items())
        doc.set_metadata(metadata)
        # NOTE(review): only the metadata dict is rewritten here; XML (XMP)
        # metadata is not touched - confirm whether
        # Document.del_xml_metadata() should be called as well.

        # remove all specified fields from all annotations on all pages
        if args.verbose:
            print("\nAnnotation metadata before anonymization:")
            print_annotation_infos(doc)
        for page in doc:
            for annot in page.annots():
                annot.set_info(anonymize_annotation_info(annot.info))
                # annot.update() can be omitted for annot.set_info() according to
                # https://pymupdf.readthedocs.io/en/latest/annot.html#Annot.update
        if args.verbose:
            print("\nAnnotation metadata after anonymization. You might check for retained infos:")
            print_annotation_infos(doc)

        # finally save document
        doc.save(outfile)
    print("\nSuccessfully saved an anonymized output to '" + outfile + "'!")


if __name__ == "__main__":
    main()
| MIT License | |
| Copyright for portions of this fork are held by Vikas Dhiman, 2024, as part | |
| of project https://gist.github.com/wecacuee/f1e92d421312b7c7c1907667f4f3a318. | |
| All other copyright for this fork are held by github.com/vanschnapen, 2024. | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. |
Thanks @nylander for the comments on the gist!
Typo in line nr 12:
prnint("Please restart the script!").
I guess this part of the code never got checked once the necessary library was installed.
Indeed, I ran into this error once, but forgot to fix it in my fork. It is fixed now, and the argument parsing has been fully revised as well.
[...] Furthermore, there are new recommendations for installing the
pymupdf library (PyMuPDF documentation): pip install --upgrade pymupdf.
Actually, I don't see the reason for an --upgrade flag on an initial pip install of a package that the script could not import and that is, hence, assumed not to be installed. Additionally, if a local candidate is present but not installed, I don't see a reason for forcing the user to fetch a new candidate (see pip documentation). However, I revised this part of the script and added a string variable that can easily be swapped by the user if the --upgrade flag is desired.
Typo in line nr 12:
prnint("Please restart the script!"). I guess this part of the code never got checked once the necessary library was installed.
Furthermore, there are new recommendations for installing the
pymupdf library (PyMuPDF documentation): pip install --upgrade pymupdf.