Last active
February 12, 2025 13:29
-
-
Save janoliver/063620261c8f12da0344fa3938b78577 to your computer and use it in GitHub Desktop.
paperless-ngx pre-consume splitter script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Simple paperless-ngx pre-consumption script for scanners that lack a
"proper" single-page PDF mode, i.e. one where a stack of paper is scanned
into individual PDF files, with both sides of a double-sided sheet combined
as two pages in one PDF file. The Epson ES-580W is one such scanner.

This script takes a multi-page PDF document and splits it into
2-page documents. If the second page of one of those documents is blank,
it is removed. To use it, set your scanner to "multi-page PDF" and have it
scan to a `single` subfolder of the consumption folder. Only documents in
this `single` folder are processed by the script. Also, don't set the
scanner to ignore blank pages, as they are removed by this script instead.

You may need to adjust the threshold for blank pages here, depending on
your scanner.

Requires Python 3.12!
"""
# Maybe adjust this, depends on the scanner.
# The second page of each 2-page pair is treated as blank when the sum of the
# four channel values on the "CMYK" line printed by Ghostscript's ink_cov
# device is below this threshold.
BLANK_THRESHOLD = 0.8
# ------------ Don't change anything below.
import argparse
import os
import pathlib
import random
import shutil
import string
import subprocess
import sys
from itertools import batched
source_path = pathlib.Path(os.environ.get("DOCUMENT_SOURCE_PATH")) | |
work_path = pathlib.Path(os.environ.get("DOCUMENT_WORKING_PATH")) | |
salt = os.environ.get( | |
"TASK_ID", | |
''.join(random.choices(string.ascii_uppercase + string.digits, k=5)) | |
)[:5] | |
# Only process documents from the "single" folder | |
if source_path.parent.name != "single": | |
exit(0) | |
# Check if PDF has more than 2 pages | |
out = subprocess.check_output(["/usr/bin/pdfinfo", str(work_path)]) | |
for l in out.splitlines(): | |
if l.strip().startswith(b"Pages:"): | |
num_pages = int(l.split()[1]) | |
if num_pages <= 2 or num_pages % 2 == 1: | |
print("Splitter: Less than two or odd number of pages. Exiting..") | |
exit(0) | |
print("Splitter: Found", num_pages, "pages to split") | |
break | |
# Split file | |
subprocess.run(["/usr/bin/pdfseparate", str(work_path), f"{work_path.parent}/{work_path.stem}-{salt}-%d.pdf"]) | |
# Merge files | |
files = list(work_path.parent.glob(f"{work_path.stem}-{salt}-*.pdf")) | |
files.sort(key=lambda f: int(''.join(filter(str.isdigit, f.name)))) | |
i = 1 | |
for x, y in batched(files, n=2): | |
if i == 1: | |
result_name = str(work_path) | |
else: | |
result_name = f"{source_path.parent}/{work_path.stem}-{i}.pdf" | |
print("Splitter: Processing pages", (i-1)*2, " and ", (i-1)*2+1) | |
print("Splitter: Store result in", result_name) | |
# Check if blank page | |
out = subprocess.check_output(["/usr/bin/gs", "-o", "-", "-sDEVICE=ink_cov", y]) | |
for l in out.splitlines(): | |
if b"CMYK" in l: | |
blank_value = sum(map(float, l.split()[:4])) | |
break | |
if blank_value < BLANK_THRESHOLD: | |
print("Splitter: Second page is blank!") | |
# Keep only first page. | |
shutil.move(x, result_name) | |
pathlib.Path(y).unlink() | |
else: | |
subprocess.run(["/usr/bin/pdfunite", x, y, result_name]) | |
pathlib.Path(x).unlink() | |
pathlib.Path(y).unlink() | |
print("Splitter: Finished", result_name) | |
i += 1 | |
print("Splitter: Done") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment