Skip to content

Instantly share code, notes, and snippets.

@mountain
Created November 20, 2025 02:39
Show Gist options
  • Select an option

  • Save mountain/0c0fc0a2d93b38d52450f970887798ba to your computer and use it in GitHub Desktop.

Select an option

Save mountain/0c0fc0a2d93b38d52450f970887798ba to your computer and use it in GitHub Desktop.
High-quality converter for compressing PDF files
import fitz # PyMuPDF
import cv2
import numpy as np
import os
from tqdm import tqdm
class HighQualityMRCConverter:
    """Recompress a PDF into a two-layer, MRC-style raster PDF.

    Every page is rendered to a bitmap and split into two images:

    * a background layer: the page with the detected text pixels painted
      white, then blurred, downsampled and stored as JPEG;
    * a foreground layer: a full-resolution RGBA PNG that is opaque black
      where text was detected and fully transparent everywhere else.

    Both images are placed on a new page that has the same size as the
    source page, background first and foreground on top.
    """

    def __init__(self, input_path, output_path, quality=60, downsample=2, dpi=300):
        """Store the conversion settings.

        :param input_path: path of the PDF to read.
        :param output_path: path the recompressed PDF is written to.
        :param quality: JPEG quality of the background layer (60 is meant
            to keep the paper texture).
        :param downsample: factor by which the background layer is shrunk
            before JPEG encoding (2 is meant to limit blockiness).
        :param dpi: resolution used to render each page (300 is meant to
            keep mathematical symbols sharp).
        :raises ValueError: if ``downsample`` is not greater than zero.
        """
        # A zero or negative factor can never produce a valid background
        # size, so reject it here instead of failing on the first page.
        if not downsample > 0:
            raise ValueError(f"downsample must be > 0, got {downsample!r}")

        self.input_path = input_path
        self.output_path = output_path
        self.jpeg_quality = quality
        self.downsample_factor = downsample
        self.dpi = dpi

    def pixmap_to_cv2(self, pix) -> np.ndarray:
        """Convert a rendered pixmap into a BGR image array.

        The sample buffer is interpreted as RGBA when ``pix.alpha`` is set
        and as RGB otherwise; any alpha channel is dropped.

        :param pix: pixmap exposing ``samples``, ``width``, ``height`` and
            ``alpha``.
        :return: array of shape ``(height, width, 3)`` in BGR channel order.
        """
        img_data = np.frombuffer(pix.samples, dtype=np.uint8)
        if pix.alpha:
            rgba = img_data.reshape(pix.height, pix.width, 4)
            return cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGR)
        rgb = img_data.reshape(pix.height, pix.width, 3)
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    def get_text_mask(self, img: np.ndarray) -> np.ndarray:
        """Build the binary mask that marks text pixels.

        :param img: BGR page image.
        :return: single-channel mask, 255 on text pixels and 0 elsewhere.
        """
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Inverted adaptive threshold: pixels darker than their local
        # neighbourhood become 255. Block size 15 and C=10 are deliberately
        # mild so that thin strokes of small math symbols are not lost.
        # No morphological opening is applied afterwards, so that small
        # marks such as decimal points are not removed.
        return cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 15, 10
        )

    def process_layer_background(self, img: np.ndarray, text_mask: np.ndarray) -> bytes:
        """Produce the JPEG-encoded background layer.

        The text is painted white *before* blurring and downsampling so the
        background contains only the paper colour; this is what prevents
        blurry "ghost" text behind the sharp foreground layer.

        :param img: BGR page image.
        :param text_mask: mask returned by :meth:`get_text_mask`.
        :return: JPEG file contents.
        :raises RuntimeError: if OpenCV fails to encode the image.
        """
        # Grow the mask slightly so the dark halo around glyph edges is
        # whitened as well.
        kernel = np.ones((3, 3), np.uint8)
        dilated_mask = cv2.dilate(text_mask, kernel, iterations=1)

        clean_bg = img.copy()
        clean_bg[dilated_mask == 255] = [255, 255, 255]

        # A small blur radius is enough because no text is left to smooth.
        blurred = cv2.GaussianBlur(clean_bg, (5, 5), 0)

        h, w = img.shape[:2]
        # Clamp to one pixel: a large factor or a tiny page would otherwise
        # truncate to 0 and make cv2.resize fail.
        new_w = max(1, int(w / self.downsample_factor))
        new_h = max(1, int(h / self.downsample_factor))
        bg_small = cv2.resize(blurred, (new_w, new_h), interpolation=cv2.INTER_AREA)

        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.jpeg_quality]
        ok, encoded_img = cv2.imencode('.jpg', bg_small, encode_param)
        if not ok:
            raise RuntimeError("JPEG encoding of the background layer failed")
        return encoded_img.tobytes()

    def process_layer_foreground(self, text_mask: np.ndarray) -> bytes:
        """Produce the PNG-encoded foreground (text) layer.

        The undilated mask is used so the glyph edges stay sharp.

        :param text_mask: mask returned by :meth:`get_text_mask`.
        :return: PNG file contents of an RGBA image that is opaque black on
            text pixels and fully transparent elsewhere.
        :raises RuntimeError: if OpenCV fails to encode the image.
        """
        h, w = text_mask.shape
        foreground = np.zeros((h, w, 4), dtype=np.uint8)
        foreground[text_mask == 255] = [0, 0, 0, 255]

        encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 9]
        ok, encoded_img = cv2.imencode('.png', foreground, encode_param)
        if not ok:
            raise RuntimeError("PNG encoding of the foreground layer failed")
        return encoded_img.tobytes()

    def _append_page(self, page, out_doc) -> None:
        """Render ``page``, split it into layers and append it to ``out_doc``."""
        pix = page.get_pixmap(dpi=self.dpi)
        img = self.pixmap_to_cv2(pix)

        # The mask is computed once and shared by both layers.
        text_mask = self.get_text_mask(img)
        bg_data = self.process_layer_background(img, text_mask)
        fg_data = self.process_layer_foreground(text_mask)

        new_page = out_doc.new_page(width=page.rect.width, height=page.rect.height)
        # Insertion order matters: background first, text on top.
        new_page.insert_image(new_page.rect, stream=bg_data)
        new_page.insert_image(new_page.rect, stream=fg_data)

    def run(self) -> None:
        """Convert ``input_path`` and write the result to ``output_path``.

        Prints an error and returns without doing anything when the input
        file does not exist. Otherwise converts every page, saves the new
        document and prints the size before and after. Both documents are
        closed even when a page fails to convert or the save fails.
        """
        if not os.path.exists(self.input_path):
            print(f"Error: File '{self.input_path}' not found.")
            return

        print(f"Strategy: High-Res MRC (DPI={self.dpi}, Whitened Background)")

        src_doc = fitz.open(self.input_path)
        try:
            out_doc = fitz.open()
            try:
                with tqdm(total=len(src_doc), desc="Compressing", unit="page", ncols=100) as pbar:
                    for page in src_doc:
                        self._append_page(page, out_doc)
                        pbar.update(1)

                print("\nSaving optimized PDF...")
                out_doc.save(self.output_path, garbage=4, deflate=True)

                original_size = os.path.getsize(self.input_path)
                compressed_size = os.path.getsize(self.output_path)
                # Guard the division so a zero-byte input cannot crash the report.
                ratio = (1 - compressed_size / original_size) * 100 if original_size else 0.0
                print(
                    f"Done! {original_size / 1024 / 1024:.2f}MB -> {compressed_size / 1024 / 1024:.2f}MB (Reduced by {ratio:.1f}%)")
            finally:
                out_doc.close()
        finally:
            src_doc.close()
if __name__ == "__main__":
    source_pdf = "topos_book.pdf"
    target_pdf = "topos_book_c.pdf"
    # Deliberately conservative compression settings: moderate JPEG quality,
    # light background downsampling and a high rendering resolution.
    HighQualityMRCConverter(
        source_pdf,
        target_pdf,
        quality=60,
        downsample=2,
        dpi=300,
    ).run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment