Created
November 20, 2025 02:39
-
-
Save mountain/0c0fc0a2d93b38d52450f970887798ba to your computer and use it in GitHub Desktop.
High-quality converter for compressing PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fitz # PyMuPDF | |
| import cv2 | |
| import numpy as np | |
| import os | |
| from tqdm import tqdm | |
class HighQualityMRCConverter:
    """Compress a PDF by rebuilding every page as a two-layer raster.

    Each page is rendered to a bitmap and split into a text mask. The
    background layer is the page with the text painted white, blurred,
    downsampled and JPEG-encoded. The foreground layer is the text mask as
    opaque black pixels on a transparent PNG. Both layers are stacked on a
    new page with the same dimensions as the source page.
    """

    def __init__(self, input_path, output_path, quality=60, downsample=2, dpi=300):
        """Store the conversion settings (high-quality preset).

        :param input_path: path of the PDF to compress
        :param output_path: path the compressed PDF is written to
        :param quality: JPEG quality of the background layer (raised to 60
            to keep the paper texture)
        :param downsample: background downsampling factor (lowered from 3
            to 2 to reduce blockiness in the background)
        :param dpi: rendering resolution (raised to 300 so mathematical
            symbols stay sharp)
        """
        self.input_path = input_path
        self.output_path = output_path
        self.jpeg_quality = quality
        self.downsample_factor = downsample
        self.dpi = dpi

    @staticmethod
    def _encode_image(extension, image, params):
        """Encode *image* with OpenCV and return the encoded bytes.

        :raises RuntimeError: if OpenCV reports that encoding failed
        """
        ok, encoded = cv2.imencode(extension, image, params)
        if not ok:
            # Fail loudly instead of inserting a broken stream into the PDF.
            raise RuntimeError(f"OpenCV failed to encode the {extension} layer")
        return encoded.tobytes()

    def _background_size(self, width, height):
        """Return the (width, height) of the downsampled background layer.

        Each side is clamped to at least one pixel so that very small pages
        never produce a zero-sized resize target.
        """
        return (
            max(1, int(width / self.downsample_factor)),
            max(1, int(height / self.downsample_factor)),
        )

    def pixmap_to_cv2(self, pix):
        """Convert a rendered pixmap into a BGR image array for OpenCV.

        The sample buffer is interpreted as RGBA when ``pix.alpha`` is set
        and as RGB otherwise; the alpha channel, if any, is dropped.
        """
        samples = np.frombuffer(pix.samples, dtype=np.uint8)
        if pix.alpha:
            rgba = samples.reshape(pix.height, pix.width, 4)
            return cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGR)
        rgb = samples.reshape(pix.height, pix.width, 3)
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    def get_text_mask(self, img):
        """Return a binary mask (255 = text, 0 = background) for a BGR image."""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Fairly gentle adaptive threshold so that small mathematical symbols
        # are not lost: a smaller block size and a smaller constant C make it
        # more sensitive to thin strokes.
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 15, 10
        )
        # NOTE: no morphological opening is applied here. A 2x2 opening was
        # previously present but disabled, presumably so that decimal points
        # and other tiny marks survive.
        return binary

    def process_layer_background(self, img, text_mask):
        """Build the JPEG-encoded background layer.

        Key idea (background whitening): the text area is painted white
        *before* blurring and downsampling, so the background only carries
        the paper colour and no blurred remnants of the glyphs.

        :param img: BGR page image
        :param text_mask: mask from :meth:`get_text_mask` (255 = text)
        :return: JPEG bytes of the cleaned, downsampled background
        :raises RuntimeError: if JPEG encoding fails
        """
        # 1. Dilate the mask so the erased area is slightly larger than the
        #    glyphs; this also removes the dark halo around text edges.
        erase_mask = cv2.dilate(text_mask, np.ones((3, 3), np.uint8), iterations=1)

        # 2. Whitening: set every masked pixel to pure white. This is what
        #    prevents blur and ghosting behind the text.
        clean_bg = img.copy()
        clean_bg[erase_mask == 255] = [255, 255, 255]

        # 3. Blur and downsample. The blur can stay small because there is
        #    no text left that would need to be smoothed away.
        blurred = cv2.GaussianBlur(clean_bg, (5, 5), 0)
        height, width = img.shape[:2]
        bg_small = cv2.resize(
            blurred,
            self._background_size(width, height),
            interpolation=cv2.INTER_AREA,
        )

        return self._encode_image(
            ".jpg", bg_small, [int(cv2.IMWRITE_JPEG_QUALITY), self.jpeg_quality]
        )

    def process_layer_foreground(self, text_mask):
        """Build the PNG-encoded foreground layer straight from the text mask.

        :param text_mask: mask from :meth:`get_text_mask` (255 = text)
        :return: PNG bytes of an RGBA image, opaque black where the mask is
            255 and fully transparent elsewhere
        :raises RuntimeError: if PNG encoding fails
        """
        height, width = text_mask.shape
        foreground = np.zeros((height, width, 4), dtype=np.uint8)
        # Text pixels become opaque black. The original (non-dilated) mask is
        # used on purpose so that glyph edges stay sharp.
        foreground[text_mask == 255] = [0, 0, 0, 255]

        return self._encode_image(
            ".png", foreground, [int(cv2.IMWRITE_PNG_COMPRESSION), 9]
        )

    def run(self):
        """Convert the input PDF and write the compressed result.

        Prints an error and returns without doing anything when the input
        file does not exist. Otherwise processes every page, saves the new
        document to ``output_path`` and prints the size reduction.
        """
        if not os.path.exists(self.input_path):
            print(f"Error: File '{self.input_path}' not found.")
            return

        print(f"Strategy: High-Res MRC (DPI={self.dpi}, Whitened Background)")

        # Context managers guarantee both documents are closed even when a
        # page fails to render or encode.
        with fitz.open(self.input_path) as src_doc, fitz.open() as out_doc:
            with tqdm(total=len(src_doc), desc="Compressing", unit="page", ncols=100) as pbar:
                for page in src_doc:
                    # 1. High-resolution render.
                    pix = page.get_pixmap(dpi=self.dpi)
                    img = self.pixmap_to_cv2(pix)

                    # 2. Text mask: computed once, shared by both layers.
                    text_mask = self.get_text_mask(img)

                    # 3. Per-layer processing (the mask is used to clean the
                    #    background).
                    bg_data = self.process_layer_background(img, text_mask)
                    fg_data = self.process_layer_foreground(text_mask)

                    # 4. Reassemble: background first, text on top.
                    new_page = out_doc.new_page(
                        width=page.rect.width, height=page.rect.height
                    )
                    new_page.insert_image(new_page.rect, stream=bg_data)
                    new_page.insert_image(new_page.rect, stream=fg_data)

                    pbar.update(1)

            print("\nSaving optimized PDF...")
            out_doc.save(self.output_path, garbage=4, deflate=True)

        # Statistics.
        original_size = os.path.getsize(self.input_path)
        compressed_size = os.path.getsize(self.output_path)
        # Guard the division so a zero-byte input cannot crash the report.
        ratio = (1 - compressed_size / original_size) * 100 if original_size else 0.0
        print(
            f"Done! {original_size / 1024 / 1024:.2f}MB -> {compressed_size / 1024 / 1024:.2f}MB (Reduced by {ratio:.1f}%)")
if __name__ == "__main__":
    # Script entry point: compress the sample book using deliberately
    # conservative compression settings.
    HighQualityMRCConverter(
        "topos_book.pdf",
        "topos_book_c.pdf",
        quality=60,
        downsample=2,
        dpi=300,
    ).run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment