Skip to content

Instantly share code, notes, and snippets.

@partrita
Last active June 22, 2025 05:09
Show Gist options
  • Select an option

  • Save partrita/87b7b9492c64e19ae4c64d26ae2e01af to your computer and use it in GitHub Desktop.

Select an option

Save partrita/87b7b9492c64e19ae4c64d26ae2e01af to your computer and use it in GitHub Desktop.
Simple Python script to compress a PDF file
from pypdf import PdfReader, PdfWriter
import argparse
import os
import sys
import logging
# Set up root logging once at import time: INFO and above, rendered as
# "LEVEL: message", so CLI users see progress and errors.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
def compress_pdf_lossless(input_path: str, output_path: str) -> None:
    """
    Apply lossless compression to a PDF by compressing its page content streams.

    The pages of the input document are copied into a new writer, every
    page's content streams are compressed at level 9, and the result is
    written to ``output_path``. The original and compressed sizes and the
    percentage reduction are logged afterwards.

    Args:
        input_path (str): Path to the input PDF file.
        output_path (str): Path where the compressed PDF will be saved.

    Raises:
        SystemExit: With exit code 1 if the input file does not exist or if
            any other error occurs while reading, compressing or writing.
    """
    logging.info(f"Starting lossless compression for '{input_path}'...")
    try:
        try:
            # Only these two steps touch the input, so a FileNotFoundError
            # here really means the *input* is missing. A missing output
            # directory is raised later by open() and must not be reported
            # as an input problem.
            original_size = os.path.getsize(input_path)
            reader = PdfReader(input_path)
        except FileNotFoundError:
            logging.error(f"Error: Input file not found at '{input_path}'. Please check the path.")
            sys.exit(1)

        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        # Iterate through pages in the writer object and apply compression.
        # Level 9 is the highest compression level (most CPU intensive).
        for i, page in enumerate(writer.pages):
            logging.debug(f"Compressing content streams for page {i+1}...")
            page.compress_content_streams(level=9)

        with open(output_path, "wb") as f:
            writer.write(f)

        compressed_size = os.path.getsize(output_path)
        # Guard against a zero-byte input to avoid dividing by zero.
        reduction_percent = ((original_size - compressed_size) / original_size) * 100 if original_size > 0 else 0
        logging.info(f"Lossless compression complete! Saved to '{output_path}'.")
        logging.info(f"Original size: {original_size / (1024*1024):.2f} MB")
        logging.info(f"Compressed size: {compressed_size / (1024*1024):.2f} MB")
        logging.info(f"Size reduction: {reduction_percent:.2f}%")
    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception
        # subclass, so it passes through this handler untouched.
        logging.error(f"An unexpected error occurred during lossless compression: {e}", exc_info=True)
        sys.exit(1)
def compress_pdf_lossy(input_path: str, output_path: str, quality: int = 80) -> None:
    """
    Apply lossy compression to a PDF by re-compressing the images on its pages.

    The pages of the input document are copied into a new writer, every
    image found on each page is replaced by a re-encoded copy at the given
    quality, and the result is written to ``output_path``. The original and
    compressed sizes and the percentage reduction are logged afterwards.
    Higher quality values result in larger files but better image quality.

    Args:
        input_path (str): Path to the input PDF file.
        output_path (str): Path where the compressed PDF will be saved.
        quality (int): Image compression quality (0-100). Default is 80.

    Raises:
        SystemExit: With exit code 1 if ``quality`` is out of range, the
            input file does not exist, or any other error occurs while
            reading, compressing or writing.
    """
    if not (0 <= quality <= 100):
        logging.error("Error: Image quality must be between 0 and 100.")
        sys.exit(1)

    logging.info(f"Starting lossy compression (quality={quality}) for '{input_path}'...")
    try:
        try:
            # Only these two steps touch the input, so a FileNotFoundError
            # here really means the *input* is missing. A missing output
            # directory is raised later by open() and must not be reported
            # as an input problem.
            original_size = os.path.getsize(input_path)
            reader = PdfReader(input_path)
        except FileNotFoundError:
            logging.error(f"Error: Input file not found at '{input_path}'. Please check the path.")
            sys.exit(1)

        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        # Iterate through pages and images within each page to apply lossy compression.
        images_compressed = 0
        for i, page in enumerate(writer.pages):
            logging.debug(f"Processing images on page {i+1}...")
            for img in page.images:
                # Replace the image with a re-compressed version.
                # The 'quality' parameter affects the JPEG compression level.
                img.replace(img.image, quality=quality)
                images_compressed += 1

        if images_compressed == 0:
            logging.warning("No images found for lossy compression. The file size might not change significantly.")

        with open(output_path, "wb") as f:
            writer.write(f)

        compressed_size = os.path.getsize(output_path)
        # Guard against a zero-byte input to avoid dividing by zero.
        reduction_percent = ((original_size - compressed_size) / original_size) * 100 if original_size > 0 else 0
        logging.info(f"Lossy compression complete! Saved to '{output_path}'.")
        logging.info(f"Original size: {original_size / (1024*1024):.2f} MB")
        logging.info(f"Compressed size: {compressed_size / (1024*1024):.2f} MB")
        logging.info(f"Size reduction: {reduction_percent:.2f}%")
    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception
        # subclass, so it passes through this handler untouched.
        logging.error(f"An unexpected error occurred during lossy compression: {e}", exc_info=True)
        sys.exit(1)
# --- Command-Line Interface (CLI) Setup ---
def main(argv=None):
    """
    Parse command-line arguments and run the requested PDF compression.

    Exactly one of ``--lossless`` or ``--quality`` must be given; argparse
    enforces this through a required mutually exclusive group.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so the
            command-line behaviour is unchanged.

    Raises:
        SystemExit: With exit code 1 if the input path is missing, is not a
            regular file, or resolves to the same file as the output path;
            argparse itself exits with code 2 on invalid usage.
    """
    parser = argparse.ArgumentParser(
        description="Compress PDF files using lossless or lossy methods.",
        formatter_class=argparse.RawTextHelpFormatter  # For better formatting of help message
    )
    # Required arguments
    parser.add_argument("input_file", type=str, help="Path to the input PDF file.")
    parser.add_argument("output_file", type=str, help="Path for the output compressed PDF file.")

    # Compression mode: exactly one of the two options must be supplied.
    compression_group = parser.add_mutually_exclusive_group(required=True)
    compression_group.add_argument(
        "-l", "--lossless", action="store_true",
        help="Perform lossless compression (compresses content streams)."
    )
    compression_group.add_argument(
        "-q", "--quality", type=int, choices=range(0, 101), metavar="[0-100]",
        help="Perform lossy compression on images with specified quality (0-100). Lower is smaller file, worse quality."
    )
    args = parser.parse_args(argv)

    # Validate paths up front so obvious mistakes fail before any work is done.
    if not os.path.exists(args.input_file):
        logging.error(f"Input file '{args.input_file}' does not exist.")
        sys.exit(1)
    if not os.path.isfile(args.input_file):
        # A directory passes the exists() check but cannot be read as a PDF.
        logging.error(f"Input path '{args.input_file}' is not a file.")
        sys.exit(1)

    # realpath() resolves symlinks (abspath() does not), so an output path
    # that is a symlink to the input is also rejected; normcase() makes the
    # comparison case-insensitive on platforms where paths are.
    resolved_input = os.path.normcase(os.path.realpath(args.input_file))
    resolved_output = os.path.normcase(os.path.realpath(args.output_file))
    if resolved_input == resolved_output:
        logging.error("Input and output file paths cannot be the same. This would overwrite the original file.")
        sys.exit(1)

    # The required mutually exclusive group guarantees that when --lossless
    # is absent, --quality was supplied with an integer value.
    if args.lossless:
        compress_pdf_lossless(args.input_file, args.output_file)
    else:
        compress_pdf_lossy(args.input_file, args.output_file, args.quality)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment