Last active
June 22, 2025 05:09
-
-
Save partrita/87b7b9492c64e19ae4c64d26ae2e01af to your computer and use it in GitHub Desktop.
Simple Python script to compress a PDF file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pypdf import PdfReader, PdfWriter | |
| import argparse | |
| import os | |
| import sys | |
| import logging | |
# Root logger setup: show INFO and above, formatted as "LEVEL: message",
# so the user gets readable progress feedback.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
def compress_pdf_lossless(input_path: str, output_path: str) -> None:
    """Apply lossless compression to a PDF by compressing its content streams.

    Every page of the input is copied into a new writer, each page's content
    streams are re-compressed at level 9, and the result is written to
    ``output_path``. On success the original size, compressed size and
    percentage reduction are logged.

    Args:
        input_path: Path to the input PDF file.
        output_path: Path where the compressed PDF will be saved.

    Raises:
        SystemExit: With status 1 if the input file is missing, the output
            file cannot be created, or any other error occurs.
    """
    logging.info(f"Starting lossless compression for '{input_path}'...")
    try:
        reader = PdfReader(input_path)
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        # Compress the pages held by the writer (not the reader's originals).
        # Level 9 is the highest compression level (most CPU intensive).
        for page_number, page in enumerate(writer.pages, start=1):
            logging.debug(f"Compressing content streams for page {page_number}...")
            page.compress_content_streams(level=9)

        with open(output_path, "wb") as f:
            writer.write(f)

        bytes_per_mb = 1024 * 1024
        original_size = os.path.getsize(input_path)
        compressed_size = os.path.getsize(output_path)
        # Guard against a zero-byte input to avoid dividing by zero.
        reduction_percent = (
            ((original_size - compressed_size) / original_size) * 100
            if original_size > 0
            else 0
        )

        logging.info(f"Lossless compression complete! Saved to '{output_path}'.")
        logging.info(f"Original size: {original_size / bytes_per_mb:.2f} MB")
        logging.info(f"Compressed size: {compressed_size / bytes_per_mb:.2f} MB")
        logging.info(f"Size reduction: {reduction_percent:.2f}%")
    except FileNotFoundError as e:
        # open(output_path, "wb") raises FileNotFoundError too when the
        # destination directory does not exist, so do not blame the input
        # file unless it is really the one that is missing.
        if os.path.exists(input_path):
            logging.error(f"Error: Could not write output file '{output_path}': {e}")
        else:
            logging.error(f"Error: Input file not found at '{input_path}'. Please check the path.")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An unexpected error occurred during lossless compression: {e}", exc_info=True)
        sys.exit(1)
def compress_pdf_lossy(input_path: str, output_path: str, quality: int = 80) -> None:
    """Apply lossy compression to a PDF by re-compressing its images.

    Every page of the input is copied into a new writer and each image on
    each page is replaced by a version of itself re-encoded with the given
    ``quality``. Higher quality values result in larger files but better
    image quality. On success the original size, compressed size and
    percentage reduction are logged.

    Args:
        input_path: Path to the input PDF file.
        output_path: Path where the compressed PDF will be saved.
        quality: Image compression quality (0-100). Default is 80.

    Raises:
        SystemExit: With status 1 if ``quality`` is out of range, the input
            file is missing, the output file cannot be created, or any other
            error occurs.
    """
    if not (0 <= quality <= 100):
        logging.error("Error: Image quality must be between 0 and 100.")
        sys.exit(1)

    logging.info(f"Starting lossy compression (quality={quality}) for '{input_path}'...")
    try:
        reader = PdfReader(input_path)
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        # Replace every image held by the writer with a re-compressed version.
        images_compressed = 0
        for page_number, page in enumerate(writer.pages, start=1):
            logging.debug(f"Processing images on page {page_number}...")
            for img in page.images:
                try:
                    # The 'quality' parameter affects the JPEG compression level.
                    img.replace(img.image, quality=quality)
                except (TypeError, ValueError, OSError) as e:
                    # NOTE(review): pypdf refuses to replace some images
                    # (e.g. inline images raise TypeError). One such image
                    # should not abort the whole document, so it is left
                    # untouched and reported.
                    logging.warning(f"Skipping image '{img.name}' on page {page_number}: {e}")
                    continue
                images_compressed += 1

        if images_compressed == 0:
            logging.warning("No images found for lossy compression. The file size might not change significantly.")

        with open(output_path, "wb") as f:
            writer.write(f)

        bytes_per_mb = 1024 * 1024
        original_size = os.path.getsize(input_path)
        compressed_size = os.path.getsize(output_path)
        # Guard against a zero-byte input to avoid dividing by zero.
        reduction_percent = (
            ((original_size - compressed_size) / original_size) * 100
            if original_size > 0
            else 0
        )

        logging.info(f"Lossy compression complete! Saved to '{output_path}'.")
        logging.info(f"Original size: {original_size / bytes_per_mb:.2f} MB")
        logging.info(f"Compressed size: {compressed_size / bytes_per_mb:.2f} MB")
        logging.info(f"Size reduction: {reduction_percent:.2f}%")
    except FileNotFoundError as e:
        # open(output_path, "wb") raises FileNotFoundError too when the
        # destination directory does not exist, so do not blame the input
        # file unless it is really the one that is missing.
        if os.path.exists(input_path):
            logging.error(f"Error: Could not write output file '{output_path}': {e}")
        else:
            logging.error(f"Error: Input file not found at '{input_path}'. Please check the path.")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An unexpected error occurred during lossy compression: {e}", exc_info=True)
        sys.exit(1)
# --- Command-Line Interface (CLI) Setup ---
def main():
    """Parse the command line and run the requested PDF compression.

    Expects two positional arguments (input and output path) and exactly one
    of ``-l/--lossless`` or ``-q/--quality N``. The paths are validated before
    any work is done: the input must exist and the output must not refer to
    the same file as the input.

    Raises:
        SystemExit: With status 1 if validation fails; argparse itself exits
            with its own status on malformed arguments.
    """
    parser = argparse.ArgumentParser(
        description="Compress PDF files using lossless or lossy methods.",
        formatter_class=argparse.RawTextHelpFormatter,  # For better formatting of help message
    )

    # Required arguments
    parser.add_argument("input_file", type=str, help="Path to the input PDF file.")
    parser.add_argument("output_file", type=str, help="Path for the output compressed PDF file.")

    # Exactly one compression mode must be chosen.
    compression_group = parser.add_mutually_exclusive_group(required=True)
    compression_group.add_argument(
        "-l", "--lossless", action="store_true",
        help="Perform lossless compression (compresses content streams).",
    )
    compression_group.add_argument(
        "-q", "--quality", type=int, choices=range(0, 101), metavar="[0-100]",
        help="Perform lossy compression on images with specified quality (0-100). Lower is smaller file, worse quality.",
    )

    args = parser.parse_args()

    # Validate input/output paths before doing any work.
    if not os.path.exists(args.input_file):
        logging.error(f"Input file '{args.input_file}' does not exist.")
        sys.exit(1)

    # Comparing abspath() alone misses symlinks and, on case-insensitive
    # platforms, differently-cased spellings of the same path. Resolve both
    # paths first, and fall back to samefile() to catch hard links when the
    # output already exists.
    input_real = os.path.normcase(os.path.realpath(args.input_file))
    output_real = os.path.normcase(os.path.realpath(args.output_file))
    same_file = input_real == output_real or (
        os.path.exists(args.output_file)
        and os.path.samefile(args.input_file, args.output_file)
    )
    if same_file:
        logging.error("Input and output file paths cannot be the same. This would overwrite the original file.")
        sys.exit(1)

    if args.lossless:
        compress_pdf_lossless(args.input_file, args.output_file)
    elif args.quality is not None:
        compress_pdf_lossy(args.input_file, args.output_file, args.quality)
    else:
        # This case should ideally not be reached due to mutually_exclusive_group and required=True
        logging.critical("No compression method specified. Use --lossless or --quality.")
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment