Skip to content

Instantly share code, notes, and snippets.

@GluTbl
Last active July 3, 2021 09:00
Show Gist options
  • Save GluTbl/99a0571f61e82792ff4508e1084cc800 to your computer and use it in GitHub Desktop.
Save GluTbl/99a0571f61e82792ff4508e1084cc800 to your computer and use it in GitHub Desktop.
[Pdf Eroder] #python
import os
import shutil
import sys
import traceback
import uuid
import time
import cv2
import numpy as np
from fpdf import FPDF
from pdf2image import convert_from_path, pdfinfo_from_path
from tqdm import tqdm
# Validate the single command-line argument: the path of the PDF to process.
# Every failure path exits with a non-zero status so calling scripts can
# detect that nothing was processed.
try:
    pdf_path: str = sys.argv[1].strip()
except IndexError:
    # No path was supplied on the command line.
    print("Imvalid argument found in python")
    sys.exit(1)
print(f"Prcessing :{pdf_path}")
# Compare the extension case-insensitively so "FILE.PDF" is accepted too.
if not pdf_path.lower().endswith(".pdf"):
    print("The given file is not pdf")
    sys.exit(1)
if not os.path.isfile(pdf_path):
    print(f"No such path \"{pdf_path}\"")
    sys.exit(1)
# Directory part of the input path; the scratch folder and the generated
# PDFs are created under it further down in this script.
WORKING = os.path.split(pdf_path)[0]
#############################
def show_list_progress(
    the_list,
    interactive: bool = True,
    unit: str = "file",
    description: str = None,
):
    """Optionally wrap an iterable in a tqdm progress bar.

    Args:
        the_list: Iterable to walk over.
        interactive: When false, ``the_list`` is returned untouched and no
            progress bar is created.
        unit: Unit label handed to tqdm.
        description: Prefix text handed to tqdm as ``desc``.

    Returns:
        ``the_list`` itself, or a ``tqdm`` iterator wrapping it.
    """
    if interactive:
        return tqdm(
            the_list,
            desc=description,
            unit=unit,
            dynamic_ncols=True,
            bar_format="{l_bar}{bar}|[{elapsed}<{remaining}, {rate_fmt}]",
        )
    return the_list
def get_file_in_subfolder(path):
    """Return the paths of the regular files located directly inside ``path``.

    Each result is ``path`` joined with the entry name. Directories are
    skipped, and entries keep the order in which ``os.listdir`` reports them.
    """
    found = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isfile(candidate):
            found.append(candidate)
    return found
#############################
# Create a uniquely named scratch folder inside WORKING.
tmp_folder_name = "tmp_" + str(uuid.uuid4())
TMP_FOLDER = os.path.join(WORKING, tmp_folder_name)
try:
    # os.makedirs raises FileExistsError when the target already exists
    # (as a file or a directory), so attempting the creation directly
    # avoids the gap between a separate existence check and the creation.
    os.makedirs(TMP_FOLDER)
except FileExistsError:
    print(f'"{TMP_FOLDER}" is already exist!!!')
    # Non-zero status: nothing was processed.
    sys.exit(1)
######################
# Render the PDF to JPEG files named "<page number>.jpg" inside TMP_FOLDER,
# converting PAGE_CHUNK pages per convert_from_path call.
info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
maxPages = info["Pages"]
PAGE_CHUNK = 2
chunk_starts = range(1, maxPages + 1, PAGE_CHUNK)
for page in show_list_progress(chunk_starts, description="Spliting pages to images", unit="page"):
    # The last chunk may be shorter than PAGE_CHUNK.
    last_in_chunk = min(page + PAGE_CHUNK - 1, maxPages)
    images_chunk = convert_from_path(pdf_path, dpi=600, first_page=page, last_page=last_in_chunk)
    # Number the rendered images starting from the first page of the chunk.
    for count, image in enumerate(images_chunk, start=page):
        image.save(os.path.join(TMP_FOLDER, f'{count}.jpg'), 'JPEG')
#############################
# Erode and binarise every page image found directly in TMP_FOLDER and
# write the result under the same file name in TMP_FOLDER/eroded.
eroded_folder = os.path.join(TMP_FOLDER, "eroded")
os.makedirs(eroded_folder)
# Only regular files are listed, so the "eroded" directory itself is skipped.
iamge_list = get_file_in_subfolder(TMP_FOLDER)
# The structuring element and the threshold are identical for every image,
# so they are built once instead of on every iteration.
kernel = np.ones((2, 2), np.uint8)
thresh = 128
for img in show_list_progress(iamge_list, unit="image", description="eroding emages..."):
    out = os.path.join(eroded_folder, os.path.split(img)[1])
    img_grey = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
    if img_grey is None:
        # cv2.imread signals an unreadable file by returning None rather
        # than raising; fail here with the offending path in the message.
        raise RuntimeError(f"Could not read image \"{img}\"")
    img_erosion = cv2.erode(img_grey, kernel, iterations=4)
    # Pixels above `thresh` become 255, all others 0.
    img_binary = cv2.threshold(img_erosion, thresh, 255, cv2.THRESH_BINARY)[1]
    if not cv2.imwrite(out, img_binary):
        # cv2.imwrite reports failure through its return value.
        raise RuntimeError(f"Could not write image \"{out}\"")
def _write_image_pdf(image_paths, output_path):
    """Write one A4 portrait page per entry of ``image_paths`` to ``output_path``.

    A ``None`` entry yields a blank page; any other entry is drawn at the
    page origin, sized 210 x 297 mm.
    """
    pdf = FPDF('P', 'mm', 'A4')
    for image_path in image_paths:
        pdf.add_page()
        if image_path is not None:
            pdf.image(image_path, 0, 0, 210, 297)
    print(f"write images: {output_path}")
    pdf.output(output_path, "F")


def make_pdf_file(filename_list, parent_image_folder, original_pdf: str):
    """Build three PDFs in ``WORKING`` from the numbered page images.

    The images are expected at ``<parent_image_folder>/<n>.jpg`` for
    ``n = 1 .. len(filename_list)``. The outputs are:

    * a PDF with every page in order,
    * a PDF with the odd-numbered pages in reverse order,
    * a PDF with the even-numbered pages in order, followed by one blank
      page when the total page count is odd so that both halves have the
      same number of pages (presumably for manual duplex printing).

    Args:
        filename_list: Only its length is used, as the page count.
        parent_image_folder: Folder holding the ``<n>.jpg`` images.
        original_pdf: Path of the source PDF; its file name is the prefix
            of every output file name.

    Raises:
        SystemExit: If the full output PDF already exists.
    """
    suffix = "_eroded_pdf_file"
    base_name = os.path.split(original_pdf)[1]
    pdf_file = os.path.join(WORKING, base_name + f"{suffix}.pdf")
    odd_file = os.path.join(WORKING, base_name + f"{suffix}_odd_actual_even_reversed_second.pdf")
    even_file = os.path.join(WORKING, base_name + f"{suffix}_even_actiual_odd_first.pdf")
    if os.path.isfile(pdf_file):
        print("Already printed")
        sys.exit()

    page_paths = [
        os.path.join(parent_image_folder, str(num)) + ".jpg"
        for num in range(1, len(filename_list) + 1)
    ]
    # Pages are numbered from 1, so list index 0 holds page 1 (odd).
    odd_file_paths = page_paths[0::2]
    even_file_paths = page_paths[1::2]

    _write_image_pdf(page_paths, pdf_file)

    # With pages numbered 1..n the odd half is never shorter than the even
    # half, so the only possible mismatch is one extra odd page.
    if len(odd_file_paths) > len(even_file_paths):
        even_file_paths.append(None)

    _write_image_pdf(odd_file_paths[::-1], odd_file)
    _write_image_pdf(even_file_paths, even_file)
# Build the output PDFs, then always remove the scratch folder.
try:
    make_pdf_file(get_file_in_subfolder(eroded_folder), eroded_folder, pdf_path)
except Exception:
    # Best effort: report the failure and still fall through to the cleanup.
    traceback.print_exc()
finally:
    # make_pdf_file may leave through sys.exit(); SystemExit is not an
    # Exception subclass, so only a finally clause guarantees that the
    # scratch folder is removed in that case as well.
    # NOTE(review): the purpose of the 2 second pause is not visible here;
    # it is kept unchanged.
    time.sleep(2)
    shutil.rmtree(TMP_FOLDER)
#!/bin/bash
# Wrapper around pdf_eroder.py: takes exactly one argument (a PDF path),
# resolves it first relative to the current directory and then as given,
# and passes the resolved path to the Python script.
# Every error path exits with a non-zero status.
if test "$#" -ne 1; then
    echo "Illegal number of parameters"
    exit 1
fi
# A purely numeric argument is rejected.
re='^[0-9]+$'
if [[ $1 =~ $re ]] ; then
    echo "Please pass a PDF file name!!!"
    exit 1
fi
FILE="$PWD/$1"
# Start from an empty value so a variable of the same name inherited from
# the environment cannot be mistaken for a resolved path.
real_pdf=""
if test -f "$FILE"; then
    real_pdf="$FILE"
elif test -f "$1"; then
    real_pdf="$1"
fi
if [[ -z "${real_pdf}" ]]; then
    echo "$1 doesnot exist"
    exit 1
else
    echo "$real_pdf"
    # NOTE: pdf_eroder.py is looked up in the current working directory.
    python3 ./pdf_eroder.py "$real_pdf"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment