Skip to content

Instantly share code, notes, and snippets.

@alephpi
Created February 25, 2024 16:09
Show Gist options
  • Save alephpi/693404ff752150aaf35b387729b9e5c2 to your computer and use it in GitHub Desktop.
Save alephpi/693404ff752150aaf35b387729b9e5c2 to your computer and use it in GitHub Desktop.
pdf clean script
import fitz # PyMuPDF
import os
from PIL import Image
from tqdm import tqdm
import cv2
import numpy as np
import img2pdf
# Minimum rendered page height in pixels: a page whose default render is
# shorter than this is re-rendered, scaled up by H / height, before cleaning.
H = 1280
def pdf_to_png(*args, output_pdf='./output2.pdf'):
    """Render every page of a PDF to a cleaned grayscale PNG, then rebuild a PDF.

    Each page is rasterised (scaled up when its default render is shorter
    than ``H`` pixels), converted to grayscale, and pushed towards pure
    black and white: pixels brighter than 200 become 255 and pixels darker
    than 127 become 0, while mid-tones are left untouched. Pages are saved
    as ``<page number>.png`` (1-based) inside the output folder and finally
    assembled, in page order, into ``output_pdf``.

    Args:
        *args: Exactly two positional values, ``(pdf_path, output_folder)``.
        output_pdf: Destination of the PDF built from the cleaned pages.

    Raises:
        ValueError: If ``args`` does not hold exactly two values.
        OSError: If a page image cannot be written to ``output_folder``.
    """
    pdf_path, output_folder = args

    # cv2.imwrite does not create missing directories; it only reports
    # failure through its return value, so make sure the folder exists.
    os.makedirs(output_folder or '.', exist_ok=True)

    # Paths written during this run, in page order. Building the PDF from
    # this list (instead of listing the folder) keeps stale or unrelated
    # PNG files that happen to live there out of the result.
    page_files = []

    # Open the PDF file.
    pdf_document = fitz.open(pdf_path)
    try:
        # Walk through every page and save it as a PNG image.
        pbar = tqdm(
            range(pdf_document.page_count),
            desc=f"Converting {os.path.basename(pdf_path)}",
        )
        for page_number in pbar:
            page = pdf_document[page_number]
            image = page.get_pixmap()
            # We need a high enough resolution: re-render short pages so
            # that they come out about H pixels tall.
            ratio = H / image.height
            if ratio > 1:
                image = page.get_pixmap(matrix=fitz.Matrix(ratio, ratio))

            # Convert to a Pillow image, then to a grayscale array.
            pil_image = Image.frombytes(
                "RGB", (image.width, image.height), image.samples
            )
            np_image = np.array(pil_image.convert('L'))

            # Clip the extremes: near-white background becomes white and
            # dark strokes become black; values in between are kept.
            np_image[np_image > 200] = 255
            np_image[np_image < 127] = 0

            # Save as a PNG image; a False return means nothing was written.
            output_path = os.path.join(output_folder, f"{page_number + 1}.png")
            if not cv2.imwrite(output_path, np_image):
                raise OSError(f"Could not write page image: {output_path}")
            page_files.append(output_path)
    finally:
        # Close the PDF file even when rendering a page failed.
        pdf_document.close()

    # Convert all page images into a single PDF.
    pdf_bytes = img2pdf.convert(page_files)
    with open(output_pdf, 'wb') as f:
        f.write(pdf_bytes)
if __name__ == '__main__':
    # Script entry point: clean the bundled sample book, placing the
    # per-page PNG files under ./output/.
    source_pdf = './音韵学概论_麦耘.pdf'
    png_folder = './output/'
    pdf_to_png(source_pdf, png_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment