-
-
Save wecacuee/f1e92d421312b7c7c1907667f4f3a318 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Anonymize a PDF by blanking document metadata and annotation info.

Usage: <script> input.pdf [output.pdf]

When no output path is given, ``.anon`` is inserted before the extension of
the input path (``paper.pdf`` -> ``paper.anon.pdf``).
"""
import os
import sys

# Document metadata keys whose values are retained.
METADATA_TO_KEEP = ['format']

# Annotation info keys whose values are retained.
FIELDS_TO_KEEP = ['content']

# Replacement values for scrubbed annotation info keys; 'other' is the
# fallback for any key without its own entry.
# NOTE(review): placeholders are deliberately non-empty -- empty strings
# reportedly left the annotations unchanged; confirm against PyMuPDF.
FIELD_DEFAULTS = {
    'modDate': "D:20000000000000-00'00'",
    'other': 'X',
}


def load_fitz():
    """Import and return PyMuPDF's ``fitz`` module.

    On ImportError a ``pip install PyMuPDF`` is attempted and the original
    ImportError is re-raised, so the script has to be run again afterwards.
    """
    try:
        import fitz
    except ImportError:
        import subprocess
        print('Trying to install pip install PyMuPDF')
        subprocess.call("pip install PyMuPDF".split())
        print('Try pip install PyMuPDF')
        raise
    return fitz


def resolve_paths(argv):
    """Return ``(filename, outfilename)`` from a ``sys.argv``-style list.

    Raises:
        ValueError: if no input PDF path was supplied.
    """
    if len(argv) < 2:
        raise ValueError("Please provide a pdf to anonymize")
    # The input path must be read before the default output name is derived
    # from it.
    filename = argv[1]
    if len(argv) >= 3:
        return filename, argv[2]
    # Only touch the final extension: str.replace('.pdf', ...) would also
    # rewrite directory names containing '.pdf' and would return the input
    # path unchanged for names such as 'paper.PDF'.
    root, ext = os.path.splitext(filename)
    return filename, root + '.anon' + ext


def blank_metadata(metadata):
    """Return a copy of *metadata* with every value not kept set to ''."""
    return {
        key: (value if key in METADATA_TO_KEEP else '')
        for key, value in metadata.items()
    }


def scrub_annot_info(info):
    """Return a copy of *info* with every key not kept set to a placeholder."""
    return {
        key: (
            value if key in FIELDS_TO_KEEP
            else FIELD_DEFAULTS.get(key, FIELD_DEFAULTS['other'])
        )
        for key, value in info.items()
    }


def main(argv=None):
    """Anonymize the PDF named in *argv* (defaults to ``sys.argv``)."""
    if argv is None:
        argv = sys.argv
    fitz = load_fitz()
    filename, outfilename = resolve_paths(argv)

    doc = fitz.open(filename)
    doc.set_metadata(blank_metadata(doc.metadata))

    for page in doc:
        for annot in page.annots():
            # Remove all annotation info other than the content.
            annot.set_info(scrub_annot_info(annot.info))
            annot.update()

    print("Printing annotations. Check for retained info in the pdf.")
    for page in doc:
        for annot in page.annots():
            print(annot.info)

    doc.save(outfilename)


if __name__ == "__main__":
    main()
| MIT License | |
| Copyright (c) 2024 Vikas Dhiman | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. |
I stumbled across this gist because I had the same problem as the OP in https://softwarerecs.stackexchange.com/questions/31971/pdf-anonymizer-remove-potentially-identifying-metadata, and I also wanted to remove author information from all annotations, which is how I ended up here.
I can reproduce the first problem reported by @jellepoland, and I found more bugs in the code. However, I don't know what @jellepoland means by his second bullet. Could you elaborate on this problem?
- the code does remove the names from annotations made inside a document viewer pdf
I am able to review the code and fix the problems. I am running Ubuntu 22.04.5 LTS (Ubuntu Budgie), with Python 3.10.12. This should also work for @jellepoland with Fedora.
@wecacuee could you please add a license to this gist? I would be glad to fork and send pull requests, provided it is FOSS licensed.
@vanschnapen I added MIT License. Please feel free to fork and update.
Updated to remove all info other than the content from annotations.
Thanks @wecacuee! With your commit 5d06efff181f5b99dea6234d01df617c21ecc14b the most urgent fix is done. Empty strings, such as the previous title="", were not possible and resulted in no changes to the annotations.
I will continue with my changes in the fork and inform you.
On Linux (Fedora), the script is not working as expected.
frontend and fritz had to be installed manually.