Skip to content

Instantly share code, notes, and snippets.

@janoliver
Last active February 12, 2025 13:29
Show Gist options
  • Save janoliver/063620261c8f12da0344fa3938b78577 to your computer and use it in GitHub Desktop.
Save janoliver/063620261c8f12da0344fa3938b78577 to your computer and use it in GitHub Desktop.
paperless-ngx pre-consume splitter script
#!/bin/env python3
"""
Simple paperless-ngx pre-consumption script for scanners that have no
"proper" single-page PDF scanning mode, i.e. no mode in which a stack of
paper is scanned into individual PDF files while the two sides of a
double-sided sheet are kept together as two pages of one PDF file. The
Epson ES-580W is one such scanner.
This script takes a multi-page PDF document and splits it into
2-page documents. If the second page of those documents is empty, it'll
be removed. In order to use it, you should set your scanner to
"multi-page PDF" and then have it scan to a `single` subfolder of the
consumption folder. Only documents in this `single` folder are processed
by the script. Also, don't set the scanner to ignore blank pages, as they
are removed by this script instead.
Maybe you need to adjust the threshold for blank pages here, depending on
your scanner.
Requires Python 3.12!
"""
# Summed ink coverage below which the second page of a pair is treated as
# blank. Adjust if needed; the right value depends on the scanner.
BLANK_THRESHOLD = 0.8
# ------------ Don't change anything below.
import argparse
import os
import subprocess
import string
import shutil
import random
import pathlib
from itertools import batched
source_path = pathlib.Path(os.environ.get("DOCUMENT_SOURCE_PATH"))
work_path = pathlib.Path(os.environ.get("DOCUMENT_WORKING_PATH"))
salt = os.environ.get(
"TASK_ID",
''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
)[:5]
# Only process documents from the "single" folder
if source_path.parent.name != "single":
exit(0)
# Check if PDF has more than 2 pages
out = subprocess.check_output(["/usr/bin/pdfinfo", str(work_path)])
for l in out.splitlines():
if l.strip().startswith(b"Pages:"):
num_pages = int(l.split()[1])
if num_pages <= 2 or num_pages % 2 == 1:
print("Splitter: Less than two or odd number of pages. Exiting..")
exit(0)
print("Splitter: Found", num_pages, "pages to split")
break
# Split file
subprocess.run(["/usr/bin/pdfseparate", str(work_path), f"{work_path.parent}/{work_path.stem}-{salt}-%d.pdf"])
# Merge files
files = list(work_path.parent.glob(f"{work_path.stem}-{salt}-*.pdf"))
files.sort(key=lambda f: int(''.join(filter(str.isdigit, f.name))))
i = 1
for x, y in batched(files, n=2):
if i == 1:
result_name = str(work_path)
else:
result_name = f"{source_path.parent}/{work_path.stem}-{i}.pdf"
print("Splitter: Processing pages", (i-1)*2, " and ", (i-1)*2+1)
print("Splitter: Store result in", result_name)
# Check if blank page
out = subprocess.check_output(["/usr/bin/gs", "-o", "-", "-sDEVICE=ink_cov", y])
for l in out.splitlines():
if b"CMYK" in l:
blank_value = sum(map(float, l.split()[:4]))
break
if blank_value < BLANK_THRESHOLD:
print("Splitter: Second page is blank!")
# Keep only first page.
shutil.move(x, result_name)
pathlib.Path(y).unlink()
else:
subprocess.run(["/usr/bin/pdfunite", x, y, result_name])
pathlib.Path(x).unlink()
pathlib.Path(y).unlink()
print("Splitter: Finished", result_name)
i += 1
print("Splitter: Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment