Last active
May 26, 2024 17:20
-
-
Save iammosespaulr/c137dea7d35f810e9fb12493d062f0b8 to your computer and use it in GitHub Desktop.
Generates a custom MD5 hash for a PDF after stripping all the metadata and using a deterministic PDF ID
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import os
import subprocess
import sys
from tempfile import NamedTemporaryFile, TemporaryDirectory
def get_hash_from_pdf(input_pdf):
    """Return the MD5 hex digest of a normalized copy of ``input_pdf``.

    ``qpdf`` is asked to start from an empty document, copy pages ``1-z``
    of ``input_pdf`` into it and emit a static (deterministic) PDF ID, so
    the document-level metadata of the input does not take part in the
    hash. The normalized copy is written to a throwaway directory that is
    removed before the function returns.

    NOTE(review): the original author reports the digest to be stable
    across re-saves of the same pages "on a bunch of pdfs"; treat that as
    an empirical observation, not a guarantee.

    Args:
        input_pdf: Path of the PDF file to fingerprint.

    Returns:
        The hexadecimal MD5 digest (``str``) of the normalized PDF bytes.

    Raises:
        subprocess.CalledProcessError: If ``qpdf`` exits with a non-zero
            status (its output is discarded, so the status is all there is).
        FileNotFoundError: If the ``qpdf`` executable cannot be found.
    """
    # Hand qpdf a path nobody holds open instead of the name of an open
    # NamedTemporaryFile: such a name cannot be reopened on Windows, and a
    # handle opened beforehand would keep pointing at the old (empty) file
    # if the output were produced by replacing the file instead of writing
    # into it. Reading back by path is correct in both situations.
    with TemporaryDirectory() as work_dir:
        normalized_pdf = os.path.join(work_dir, "normalized.pdf")

        # Run qpdf to strip metadata and write the normalized document.
        subprocess.run(
            [
                'qpdf', '-empty', '-static-id',
                '-pages', input_pdf, '1-z', '--',
                normalized_pdf,
            ],
            check=True,
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )

        # Hash in fixed-size chunks so large PDFs are never fully in memory.
        digest = hashlib.md5()
        with open(normalized_pdf, 'rb') as pdf_file:
            for chunk in iter(lambda: pdf_file.read(1 << 20), b''):
                digest.update(chunk)

    return digest.hexdigest()
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the PDF path.
    if len(sys.argv) != 2:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so stdout only ever carries the hash and a caller that
        # captures the output never mistakes the usage text for a digest.
        sys.exit("Usage: python get_hash_from_pdf.py input_pdf.pdf")
    print(get_hash_from_pdf(sys.argv[1]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The only dependency here is `qpdf`, which can be installed with `sudo apt-get install qpdf` or `brew install qpdf`.