-
-
Save vanschnapen/4707a576e822fe4c7349572779311f49 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""anonymize-pdf: Anonymize document and annotation metadata in PDFs.
"""
import argparse
import os
import sys

try:
    # check if pymupdf is importable (installed)
    import pymupdf
except ImportError:
    # if not importable, try to install pymupdf
    import subprocess

    # Run pip through the interpreter that executes this script, so the
    # package is installed into the environment the script will run in.
    install_command = [sys.executable, "-m", "pip", "install", "PyMuPDF"]
    # Alternative recommended by the PyMuPDF documentation
    # (https://pymupdf.readthedocs.io/en/latest/installation.html#id1):
    # install_command = [sys.executable, "-m", "pip", "install", "--upgrade", "PyMuPDF"]
    print("Trying to install required package PyMuPDF by '"
          + " ".join(install_command) + "'")
    subprocess.call(install_command)
    print("Please restart the script!")
    raise

# Document metadata fields that are left untouched; all others are blanked.
METADATA_FIELDS_TO_KEEP = ['format',]
# Annotation info fields that are left untouched.
ANNOTATION_FIELDS_TO_KEEP = ['content']
# Replacement values for annotation info fields; 'other' is the fallback
# for every field without an explicit entry.
ANNOTATION_FIELD_DEFAULTS = {'modDate': "D:20000000000000-00'00'",
                             'other': 'X'}


def parse_args():
    """Parse the command line and return the argparse namespace."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("infile",
                        help="path to input pdf file (infile.pdf)")
    parser.add_argument("outfile", nargs="?",
                        help="path to output pdf file (outfile.pdf)",
                        default=None)
    parser.add_argument("-v", "--verbose",
                        help="display metadata before and after anonymization",
                        action="store_true")
    return parser.parse_args()


def default_outfile(infile):
    """Derive an output filename from the input filename.

    'doc.pdf' becomes 'doc.anonymized.pdf'. Only the trailing extension is
    touched (any letter case), so '.pdf' inside directory names is left
    alone. Input names without a pdf extension get '.anonymized.pdf'
    appended, so the result never equals the input path.
    """
    root, ext = os.path.splitext(infile)
    if ext.lower() == '.pdf':
        return root + '.anonymized' + ext
    return infile + '.anonymized.pdf'


def print_annotation_info(doc):
    """Print the info dictionary of every annotation on every page."""
    for page in doc:
        for annot in page.annots():
            print(annot.info)


def anonymize_document_metadata(doc, verbose=False):
    """Blank all document metadata except METADATA_FIELDS_TO_KEEP."""
    metadata = doc.metadata
    if verbose:
        print("document metadata before anonymization:\n", metadata.items())
    for key in metadata:
        if key not in METADATA_FIELDS_TO_KEEP:
            metadata[key] = ''
    if verbose:
        print("Document metadata after anonymization. "
              "You might check for retained infos:\n", metadata.items())
    doc.set_metadata(metadata)
    # The XML (XMP) metadata stream is stored separately from the metadata
    # dictionary and may repeat author/tool information, so drop it as well.
    doc.del_xml_metadata()


def anonymize_annotations(doc, verbose=False):
    """Overwrite annotation info fields except ANNOTATION_FIELDS_TO_KEEP."""
    if verbose:
        print("\nAnnotation metadata before anonymization:")
        print_annotation_info(doc)
    fallback = ANNOTATION_FIELD_DEFAULTS['other']
    for page in doc:
        for annot in page.annots():
            info = annot.info
            for key in info:
                if key not in ANNOTATION_FIELDS_TO_KEEP:
                    info[key] = ANNOTATION_FIELD_DEFAULTS.get(key, fallback)
            annot.set_info(info)
            # annot.update() can be omitted for annot.set_info() according to
            # https://pymupdf.readthedocs.io/en/latest/annot.html#Annot.update
    if verbose:
        print("\nAnnotation metadata after anonymization. "
              "You might check for retained infos:")
        print_annotation_info(doc)


def main():
    """Anonymize the PDF named on the command line and save the result."""
    args = parse_args()
    infile = args.infile
    if args.outfile:
        outfile = args.outfile
    else:
        # no output filename given: derive it from the input filename
        outfile = default_outfile(infile)
        print("Using '" + outfile + "' as output filename for inputfile '"
              + infile + "'")
    # the context manager closes the document even if a step fails
    with pymupdf.open(infile) as doc:
        anonymize_document_metadata(doc, args.verbose)
        anonymize_annotations(doc, args.verbose)
        # finally save document
        doc.save(outfile)
    print("\nSuccessfully saved an anonymized output to '" + outfile + "'!")


if __name__ == "__main__":
    main()
MIT License | |
Copyright for portions of this fork are held by Vikas Dhiman, 2024, as part | |
of project https://gist.github.com/wecacuee/f1e92d421312b7c7c1907667f4f3a318. | |
All other copyright for this fork are held by github.com/vanschnapen, 2024. | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. |
Thanks @nylander for the comments on the gist!
Typo in line nr 12:
prnint("Please restart the script!")
.
I guess this part of the code never got checked once the necessary library was installed.
Indeed, I ran into this error once but forgot to fix it in my fork. It is fixed now, and the argument parsing has been revised as well.
[...] Furthermore, there are new recommendations for installing the
pymupdf
library (PyMuPDF documentation):pip install --upgrade pymupdf
.
Actually, I don't get the reason for an --upgrade
flag for an initial pip install of a package that was not importable by the script and, hence, is assumed not to be installed. Additionally, if a local candidate is present but not installed, I don't see a reason for forcing a user to fetch a new candidate (see pip documentation). However, I revised this part of the script and added a string variable that can easily be swapped by the user if the --upgrade
flag is desired.
Typo in line nr 12:
prnint("Please restart the script!")
. I guess this part of the code never got checked once the necessary library was installed.
Furthermore, there are new recommendations for installing the
pymupdf
library (PyMuPDF documentation):pip install --upgrade pymupdf
.