mountain · November 20, 2025 02:39
diff --git a/pdfc.py b/pdfc.py
 import fitz  # PyMuPDF
 import cv2
 import numpy as np
 import os
 from tqdm import tqdm


 class HighQualityMRCConverter:
    """Rebuild a PDF as two raster layers per page (MRC-style compression).

    Every page is rendered to a bitmap and split into a binary text mask and
    a text-free background. The output page is a downsampled JPEG background
    with a full-resolution, transparent PNG text layer inserted after it.
    """

    def __init__(self, input_path, output_path, quality=60, downsample=2, dpi=300):
        """Store the conversion settings (high-quality preset).

        :param input_path: path of the PDF to read
        :param output_path: path the rebuilt PDF is saved to
        :param quality: JPEG quality of the background layer
            (raised to 60 to keep the paper texture)
        :param downsample: factor the background width/height are divided by
            (lowered from 3 to 2 to reduce blockiness in the background)
        :param dpi: rendering resolution
            (raised to 300 so mathematical symbols stay sharp)
        """
        self.input_path = input_path
        self.output_path = output_path
        self.jpeg_quality = quality
        self.downsample_factor = downsample
        self.dpi = dpi

    def pixmap_to_cv2(self, pix):
        """Convert a rendered pixmap into a 3-channel BGR array.

        :param pix: object exposing ``samples``, ``height``, ``width`` and
            ``alpha`` (``run`` passes the result of ``page.get_pixmap``)
        :return: the result of ``cv2.cvtColor``; an alpha channel, when
            present, is dropped by the RGBA->BGR conversion
        """
        # The buffer is interpreted as RGB or RGBA only.
        # NOTE(review): other colorspaces (gray, CMYK) are not handled here.
        channels = 4 if pix.alpha else 3
        conversion = cv2.COLOR_RGBA2BGR if pix.alpha else cv2.COLOR_RGB2BGR
        pixels = np.frombuffer(pix.samples, dtype=np.uint8)
        return cv2.cvtColor(pixels.reshape(pix.height, pix.width, channels), conversion)

    def get_text_mask(self, img):
        """Extract a binary mask of the text from a BGR page image.

        :param img: BGR image array
        :return: single-channel mask from an inverted adaptive threshold,
            i.e. 255 on pixels classified as text and 0 elsewhere
        """
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # 1. A slightly gentler binarization so small mathematical symbols
        #    are not lost: a smaller block size and a smaller C make the
        #    threshold more sensitive to thin strokes.
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 15, 10
        )

        # 2. Very light denoising (only tiny specks would be removed) was
        #    drafted here but is left disabled so decimal points survive:
        # kernel = np.ones((2,2), np.uint8)
        # binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

        return binary

    def _downsampled_size(self, width, height):
        """Return the ``(width, height)`` of the downsampled background.

        Each dimension is divided by ``self.downsample_factor`` and truncated,
        but never allowed to drop below one pixel, because ``cv2.resize``
        rejects an empty target size.
        """
        factor = self.downsample_factor
        return max(1, int(width / factor)), max(1, int(height / factor))

    @staticmethod
    def _encode(extension, image, params):
        """Encode *image* with ``cv2.imencode`` and return the raw bytes.

        :raises RuntimeError: if OpenCV reports that the encoding failed, so
            that a broken stream is never embedded into the output PDF
        """
        ok, buffer = cv2.imencode(extension, image, params)
        if not ok:
            raise RuntimeError(f"cv2.imencode failed for '{extension}'")
        return buffer.tobytes()

    def process_layer_background(self, img, text_mask) -> bytes:
        """Build the JPEG background layer with the text whitened out.

        Key improvement (background whitening): the text area is painted
        white BEFORE blurring and downsampling.

        :param img: BGR page image
        :param text_mask: mask from ``get_text_mask`` (255 marks text)
        :return: JPEG-encoded bytes of the blurred, downsampled background
        :raises RuntimeError: if JPEG encoding fails
        """
        # 1. Dilate the mask so the erased area is slightly larger than the
        #    glyphs; this also removes the dark halo around text edges.
        kernel = np.ones((3, 3), np.uint8)
        dilated_mask = cv2.dilate(text_mask, kernel, iterations=1)

        # 2. Whitening: set the masked pixels to pure white (255, 255, 255).
        #    This is what prevents blurry text and ghosting in the background.
        clean_bg = img.copy()
        clean_bg[dilated_mask == 255] = [255, 255, 255]

        # 3. Blur and downsample now that only the paper colour is left; the
        #    blur kernel can stay small because no text has to be smeared.
        blurred = cv2.GaussianBlur(clean_bg, (5, 5), 0)

        h, w = img.shape[:2]
        bg_small = cv2.resize(
            blurred, self._downsampled_size(w, h), interpolation=cv2.INTER_AREA
        )

        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.jpeg_quality]
        return self._encode('.jpg', bg_small, encode_param)

    def process_layer_foreground(self, text_mask) -> bytes:
        """Build the PNG text layer directly from the extracted mask.

        :param text_mask: mask from ``get_text_mask`` (255 marks text)
        :return: PNG-encoded bytes of a 4-channel image that is opaque black
            on text pixels and fully transparent everywhere else
        :raises RuntimeError: if PNG encoding fails
        """
        h, w = text_mask.shape
        foreground = np.zeros((h, w, 4), dtype=np.uint8)

        # Text pixels become opaque black. The original (non-dilated) mask is
        # used on purpose so the glyph edges stay sharp.
        foreground[text_mask == 255] = [0, 0, 0, 255]

        encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 9]
        return self._encode('.png', foreground, encode_param)

    def _convert_page(self, page, out_doc):
        """Render one source page and append its two-layer version to *out_doc*."""
        # 1. High-resolution render.
        pix = page.get_pixmap(dpi=self.dpi)
        img = self.pixmap_to_cv2(pix)

        # 2. Text mask: computed once, shared by both layers.
        text_mask = self.get_text_mask(img)

        # 3. Build the layers (the mask is passed in to whiten the background).
        bg_data = self.process_layer_background(img, text_mask)
        fg_data = self.process_layer_foreground(text_mask)

        # 4. Reassemble: background inserted first, then the text layer.
        new_page = out_doc.new_page(width=page.rect.width, height=page.rect.height)
        new_page.insert_image(new_page.rect, stream=bg_data)
        new_page.insert_image(new_page.rect, stream=fg_data)

    def run(self) -> None:
        """Convert ``input_path`` into ``output_path`` and print a size report.

        Prints an error and returns without doing anything when the input
        file does not exist. Both documents are closed even if rendering or
        saving raises, so no file handles are leaked on failure.
        """
        if not os.path.exists(self.input_path):
            print(f"Error: File '{self.input_path}' not found.")
            return

        print(f"Strategy: High-Res MRC (DPI={self.dpi}, Whitened Background)")

        src_doc = fitz.open(self.input_path)
        try:
            out_doc = fitz.open()
            try:
                with tqdm(total=len(src_doc), desc="Compressing", unit="page", ncols=100) as pbar:
                    for page in src_doc:
                        self._convert_page(page, out_doc)
                        pbar.update(1)

                print("\nSaving optimized PDF...")
                out_doc.save(self.output_path, garbage=4, deflate=True)
            finally:
                out_doc.close()
        finally:
            src_doc.close()

        # Statistics.
        original_size = os.path.getsize(self.input_path)
        compressed_size = os.path.getsize(self.output_path)
        ratio = (1 - compressed_size / original_size) * 100
        print(
            f"Done! {original_size / 1024 / 1024:.2f}MB -> {compressed_size / 1024 / 1024:.2f}MB (Reduced by {ratio:.1f}%)")


 if __name__ == "__main__":
    # Script entry point: compress the hard-coded book with the more
    # conservative (higher-quality) compression settings.
    settings = {"quality": 60, "downsample": 2, "dpi": 300}
    source_pdf, target_pdf = "topos_book.pdf", "topos_book_c.pdf"

    HighQualityMRCConverter(source_pdf, target_pdf, **settings).run()
	import fitz # PyMuPDF
	import cv2
	import numpy as np
	import os
	from tqdm import tqdm


	class HighQualityMRCConverter:
	    """Rebuild a PDF as two raster layers per page (MRC-style compression).

	    Every page is rendered to a bitmap and split into a binary text mask and
	    a text-free background. The output page is a downsampled JPEG background
	    with a full-resolution, transparent PNG text layer inserted after it.
	    """

	    def __init__(self, input_path, output_path, quality=60, downsample=2, dpi=300):
	        """Store the conversion settings (high-quality preset).

	        :param input_path: path of the PDF to read
	        :param output_path: path the rebuilt PDF is saved to
	        :param quality: JPEG quality of the background layer
	            (raised to 60 to keep the paper texture)
	        :param downsample: factor the background width/height are divided by
	            (lowered from 3 to 2 to reduce blockiness in the background)
	        :param dpi: rendering resolution
	            (raised to 300 so mathematical symbols stay sharp)
	        """
	        self.input_path = input_path
	        self.output_path = output_path
	        self.jpeg_quality = quality
	        self.downsample_factor = downsample
	        self.dpi = dpi

	    def pixmap_to_cv2(self, pix):
	        """Convert a rendered pixmap into a 3-channel BGR array.

	        :param pix: object exposing ``samples``, ``height``, ``width`` and
	            ``alpha`` (``run`` passes the result of ``page.get_pixmap``)
	        :return: the result of ``cv2.cvtColor``; an alpha channel, when
	            present, is dropped by the RGBA->BGR conversion
	        """
	        # The buffer is interpreted as RGB or RGBA only.
	        # NOTE(review): other colorspaces (gray, CMYK) are not handled here.
	        channels = 4 if pix.alpha else 3
	        conversion = cv2.COLOR_RGBA2BGR if pix.alpha else cv2.COLOR_RGB2BGR
	        pixels = np.frombuffer(pix.samples, dtype=np.uint8)
	        return cv2.cvtColor(pixels.reshape(pix.height, pix.width, channels), conversion)

	    def get_text_mask(self, img):
	        """Extract a binary mask of the text from a BGR page image.

	        :param img: BGR image array
	        :return: single-channel mask from an inverted adaptive threshold,
	            i.e. 255 on pixels classified as text and 0 elsewhere
	        """
	        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	        # 1. A slightly gentler binarization so small mathematical symbols
	        #    are not lost: a smaller block size and a smaller C make the
	        #    threshold more sensitive to thin strokes.
	        binary = cv2.adaptiveThreshold(
	            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	            cv2.THRESH_BINARY_INV, 15, 10
	        )

	        # 2. Very light denoising (only tiny specks would be removed) was
	        #    drafted here but is left disabled so decimal points survive:
	        # kernel = np.ones((2,2), np.uint8)
	        # binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

	        return binary

	    def _downsampled_size(self, width, height):
	        """Return the ``(width, height)`` of the downsampled background.

	        Each dimension is divided by ``self.downsample_factor`` and truncated,
	        but never allowed to drop below one pixel, because ``cv2.resize``
	        rejects an empty target size.
	        """
	        factor = self.downsample_factor
	        return max(1, int(width / factor)), max(1, int(height / factor))

	    @staticmethod
	    def _encode(extension, image, params):
	        """Encode *image* with ``cv2.imencode`` and return the raw bytes.

	        :raises RuntimeError: if OpenCV reports that the encoding failed, so
	            that a broken stream is never embedded into the output PDF
	        """
	        ok, buffer = cv2.imencode(extension, image, params)
	        if not ok:
	            raise RuntimeError(f"cv2.imencode failed for '{extension}'")
	        return buffer.tobytes()

	    def process_layer_background(self, img, text_mask) -> bytes:
	        """Build the JPEG background layer with the text whitened out.

	        Key improvement (background whitening): the text area is painted
	        white BEFORE blurring and downsampling.

	        :param img: BGR page image
	        :param text_mask: mask from ``get_text_mask`` (255 marks text)
	        :return: JPEG-encoded bytes of the blurred, downsampled background
	        :raises RuntimeError: if JPEG encoding fails
	        """
	        # 1. Dilate the mask so the erased area is slightly larger than the
	        #    glyphs; this also removes the dark halo around text edges.
	        kernel = np.ones((3, 3), np.uint8)
	        dilated_mask = cv2.dilate(text_mask, kernel, iterations=1)

	        # 2. Whitening: set the masked pixels to pure white (255, 255, 255).
	        #    This is what prevents blurry text and ghosting in the background.
	        clean_bg = img.copy()
	        clean_bg[dilated_mask == 255] = [255, 255, 255]

	        # 3. Blur and downsample now that only the paper colour is left; the
	        #    blur kernel can stay small because no text has to be smeared.
	        blurred = cv2.GaussianBlur(clean_bg, (5, 5), 0)

	        h, w = img.shape[:2]
	        bg_small = cv2.resize(
	            blurred, self._downsampled_size(w, h), interpolation=cv2.INTER_AREA
	        )

	        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), self.jpeg_quality]
	        return self._encode('.jpg', bg_small, encode_param)

	    def process_layer_foreground(self, text_mask) -> bytes:
	        """Build the PNG text layer directly from the extracted mask.

	        :param text_mask: mask from ``get_text_mask`` (255 marks text)
	        :return: PNG-encoded bytes of a 4-channel image that is opaque black
	            on text pixels and fully transparent everywhere else
	        :raises RuntimeError: if PNG encoding fails
	        """
	        h, w = text_mask.shape
	        foreground = np.zeros((h, w, 4), dtype=np.uint8)

	        # Text pixels become opaque black. The original (non-dilated) mask is
	        # used on purpose so the glyph edges stay sharp.
	        foreground[text_mask == 255] = [0, 0, 0, 255]

	        encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 9]
	        return self._encode('.png', foreground, encode_param)

	    def _convert_page(self, page, out_doc):
	        """Render one source page and append its two-layer version to *out_doc*."""
	        # 1. High-resolution render.
	        pix = page.get_pixmap(dpi=self.dpi)
	        img = self.pixmap_to_cv2(pix)

	        # 2. Text mask: computed once, shared by both layers.
	        text_mask = self.get_text_mask(img)

	        # 3. Build the layers (the mask is passed in to whiten the background).
	        bg_data = self.process_layer_background(img, text_mask)
	        fg_data = self.process_layer_foreground(text_mask)

	        # 4. Reassemble: background inserted first, then the text layer.
	        new_page = out_doc.new_page(width=page.rect.width, height=page.rect.height)
	        new_page.insert_image(new_page.rect, stream=bg_data)
	        new_page.insert_image(new_page.rect, stream=fg_data)

	    def run(self) -> None:
	        """Convert ``input_path`` into ``output_path`` and print a size report.

	        Prints an error and returns without doing anything when the input
	        file does not exist. Both documents are closed even if rendering or
	        saving raises, so no file handles are leaked on failure.
	        """
	        if not os.path.exists(self.input_path):
	            print(f"Error: File '{self.input_path}' not found.")
	            return

	        print(f"Strategy: High-Res MRC (DPI={self.dpi}, Whitened Background)")

	        src_doc = fitz.open(self.input_path)
	        try:
	            out_doc = fitz.open()
	            try:
	                with tqdm(total=len(src_doc), desc="Compressing", unit="page", ncols=100) as pbar:
	                    for page in src_doc:
	                        self._convert_page(page, out_doc)
	                        pbar.update(1)

	                print("\nSaving optimized PDF...")
	                out_doc.save(self.output_path, garbage=4, deflate=True)
	            finally:
	                out_doc.close()
	        finally:
	            src_doc.close()

	        # Statistics.
	        original_size = os.path.getsize(self.input_path)
	        compressed_size = os.path.getsize(self.output_path)
	        ratio = (1 - compressed_size / original_size) * 100
	        print(
	            f"Done! {original_size / 1024 / 1024:.2f}MB -> {compressed_size / 1024 / 1024:.2f}MB (Reduced by {ratio:.1f}%)")


	if __name__ == "__main__":
	    # Script entry point: compress the hard-coded book file.
	    input_file = "topos_book.pdf"
	    output_file = "topos_book_c.pdf"

	    # The more conservative (higher-quality) compression parameters are used here.
	    converter = HighQualityMRCConverter(input_file, output_file, quality=60, downsample=2, dpi=300)
	    converter.run()
No results found