Created
February 25, 2024 16:09
-
-
Save alephpi/693404ff752150aaf35b387729b9e5c2 to your computer and use it in GitHub Desktop.
pdf clean script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import cv2
import fitz  # PyMuPDF
import img2pdf
import numpy as np
from PIL import Image
from tqdm import tqdm
# Minimum page height in pixels: pages whose default rendering is shorter
# than this are re-rendered scaled up so their height is about H.
H = 1280
def pdf_to_png(*args, output_pdf='./output2.pdf'):
    """Render a PDF to cleaned grayscale PNG pages and rebuild a PDF from them.

    Each page is rasterized (scaled up when its default rendering is
    shorter than ``H`` pixels), converted to grayscale, and contrast-cleaned:
    pixels brighter than 200 become pure white and pixels darker than 127
    become pure black; values in between are left untouched. Every page is
    written as ``<output_folder>/<page number>.png`` (1-based), and the
    pages written by this call are then combined, in order, into
    ``output_pdf``.

    Args:
        *args: Exactly two positional arguments, ``(pdf_path, output_folder)``:
            the source PDF path and the directory that receives the PNGs.
            The directory is created if it does not exist.
        output_pdf: Path of the rebuilt PDF. Defaults to ``'./output2.pdf'``.

    Raises:
        ValueError: If ``args`` does not contain exactly two items.
        OSError: If a page image cannot be written to disk.
    """
    pdf_path, output_folder = args

    # cv2.imwrite returns False instead of raising when the target directory
    # is missing, so make sure it exists up front.
    os.makedirs(output_folder, exist_ok=True)

    page_paths = []
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as pdf_document:
        pbar = tqdm(
            range(pdf_document.page_count),
            desc=f"Converting {os.path.basename(pdf_path)}",
        )
        for page_number in pbar:
            page = pdf_document[page_number]
            image = page.get_pixmap()
            ratio = H / image.height
            # We need a high enough resolution: upscale short pages only.
            if ratio > 1:
                image = page.get_pixmap(matrix=fitz.Matrix(ratio, ratio))

            # Pixmap samples -> Pillow image -> 8-bit grayscale array.
            pil_image = Image.frombytes(
                "RGB", (image.width, image.height), image.samples
            )
            np_image = np.array(pil_image.convert('L'))

            # Push near-white to white and dark pixels to black; the
            # mid-tones (127..200) keep their original value.
            np_image[np_image > 200] = 255
            np_image[np_image < 127] = 0

            output_path = os.path.join(output_folder, f"{page_number + 1}.png")
            if not cv2.imwrite(output_path, np_image):
                raise OSError(f"Failed to write page image: {output_path}")
            page_paths.append(output_path)

    # Build the PDF only from the pages written above, already in page
    # order, so unrelated or stale PNGs in the folder are not picked up.
    pdf_bytes = img2pdf.convert(page_paths)
    with open(output_pdf, 'wb') as f:
        f.write(pdf_bytes)
if __name__ == '__main__':
    # Script entry point: clean the bundled PDF, writing page images
    # into ./output/.
    pdf_to_png('./音韵学概论_麦耘.pdf', './output/')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment