Created
November 3, 2023 21:59
-
-
Save jlumbroso/3142108d7884d1c7d1dba1a4b10aa9af to your computer and use it in GitHub Desktop.
A Python script to recursively decrypt PDF files using `qpdf --decrypt`, handling files without passwords and overwriting the originals if decryption is successful.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
decrypt_all_pdfs.py | |
Author: Jérémie Lumbroso <[email protected]> | |
Date: November 3, 2023 | |
Description: | |
This script recursively finds all PDF files within a specified directory path, | |
checks if they are encrypted (without a password), and attempts to decrypt them | |
using `qpdf --decrypt`. It may require modifications if PDF files are password-protected. | |
This script is distributed under the MIT License. | |
Usage: | |
python decrypt_all_pdfs.py /path/to/pdf/directory | |
""" | |
import glob | |
import os | |
import shutil | |
import subprocess | |
import sys | |
import tempfile | |
# MIT License | |
# Copyright (c) 2023 Jérémie Lumbroso | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
def detect_pdf_decryption_status(pdf_path):
    """Report whether `pdftotext` can extract text from a PDF without complaint.

    The PDF is copied into a scratch directory and converted there, so
    neither the original file nor its directory is modified. The batch
    driver in this script treats a False result as "needs decrypting".

    Args:
        pdf_path: Path to the PDF file to probe.

    Returns:
        True if `pdftotext` created the text file and wrote nothing to
        stderr; False otherwise.

    Raises:
        FileNotFoundError: If `pdf_path` does not exist (raised by the copy)
            or the `pdftotext` executable cannot be launched.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a private copy inside the scratch directory.
        temp_pdf_path = shutil.copy(pdf_path, tmpdirname)
        text_file_path = os.path.splitext(temp_pdf_path)[0] + '.txt'

        # stderr is captured in memory instead of in a "stderr.txt" file in
        # the scratch directory: that file would collide with the text output
        # of a PDF that happens to be named "stderr.pdf". The output path is
        # passed explicitly so the check does not depend on pdftotext's
        # default output naming.
        result = subprocess.run(
            ['pdftotext', temp_pdf_path, text_file_path],
            stderr=subprocess.PIPE,
        )

        # The exit status is deliberately not consulted: any diagnostic on
        # stderr, or a missing output file, already counts as "not clean".
        # The scratch directory is removed by the context manager on return.
        return os.path.exists(text_file_path) and not result.stderr
def decrypt_pdf(pdf_path):
    """Decrypt a PDF in place using `qpdf --decrypt`.

    The file is copied into a scratch directory, decrypted there, and the
    decrypted result is then copied back over the original path. The
    original is only overwritten after qpdf has reported success.

    Args:
        pdf_path: Path to the PDF file to decrypt; overwritten on success.

    Raises:
        RuntimeError: If the `qpdf` executable is not found on the PATH.
        subprocess.CalledProcessError: If qpdf exits with a status other
            than 0 (success) or 3 (success with warnings).
        FileNotFoundError: If `pdf_path` does not exist, or qpdf reported
            success but the decrypted file is missing.
    """
    # Fail early with a clear message instead of an obscure launch error.
    if shutil.which("qpdf") is None:
        raise RuntimeError("qpdf is not installed or not found in system PATH.")

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Work on a private copy inside the scratch directory.
        temp_pdf_path = shutil.copy(pdf_path, tmpdirname)
        decrypted_pdf_path = os.path.splitext(temp_pdf_path)[0] + '.decrypted.pdf'

        result = subprocess.run(['qpdf', '--decrypt', temp_pdf_path, decrypted_pdf_path])
        # qpdf exits with 3 when it hit warnings but still wrote the output
        # file; only other non-zero statuses are treated as failures.
        if result.returncode not in (0, 3):
            result.check_returncode()

        if not os.path.exists(decrypted_pdf_path):
            raise FileNotFoundError("Decryption failed, decrypted file not found.")

        # Overwrite the original with the decrypted copy. The scratch
        # directory is removed by the context manager afterwards.
        shutil.copy(decrypted_pdf_path, pdf_path)
def decrypt_all_pdfs(path):
    """Decrypt every PDF found under a directory tree, in place.

    Each `*.pdf` file below `path` is probed with
    `detect_pdf_decryption_status`; files for which the probe returns False
    are passed to `decrypt_pdf`. Progress and per-file errors are printed to
    stdout; a failure on one file does not stop the remaining files.

    Args:
        path: Root directory to search recursively.
    """
    # Escape the directory part so characters such as "[", "*" or "?" in the
    # directory name are matched literally instead of as glob wildcards.
    pattern = os.path.join(glob.escape(os.fspath(path)), '**/*.pdf')

    for pdf_file in glob.glob(pattern, recursive=True):
        if detect_pdf_decryption_status(pdf_file):
            print(f"File is not encrypted or already decrypted: {pdf_file}")
            continue

        print(f"Decrypting: {pdf_file}")
        try:
            decrypt_pdf(pdf_file)
        except Exception as e:
            # Batch boundary: report the failure and carry on with the rest.
            print(f"An error occurred while decrypting {pdf_file}: {e}")
        else:
            print(f"Decryption successful for: {pdf_file}")
# Script entry point.
if __name__ == "__main__":
    # The directory to process is the first command line argument; when none
    # is given, fall back to the current working directory.
    try:
        path_to_pdfs = sys.argv[1]
    except IndexError:
        path_to_pdfs = os.getcwd()
    decrypt_all_pdfs(path_to_pdfs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment