janoliver · February 12, 2025 13:29
diff --git a/split.py b/split.py
 #!/bin/env python3

 """
 Simple paperless-ngx pre-consumption script for scanners (such as the
 Epson ES-580W) that have no "proper" single-sheet PDF mode, i.e. a mode
 where a stack of paper is scanned into individual PDF files, with the two
 sides of each double-sided sheet combined as two pages of one PDF file.

 This script takes a multi-page PDF document and splits it into 
 2-page documents. If the second page of those documents is empty, it'll
 be removed. In order to use it, you should set your scanner to 
 "multi-page PDF" and then have it scan to a `single` subfolder of the 
 consumption folder. Only documents in this `single` folder are processed
 by the script. Also, don't set the scanner to ignore blank pages, as they
 are removed by this script instead. 

 Maybe you need to adjust the threshold for blank pages here, depending on 
 your scanner.

 Requires Python 3.12!
 """

 # Blank-page threshold: the second page of each pair is dropped when the sum
 # of the four C/M/Y/K values that Ghostscript's ink_cov device prints for it
 # is below this number. Maybe adjust this, depends on the scanner.
 BLANK_THRESHOLD = 0.8

 # ------------ Don't change anything below.

 import argparse
 import os
 import subprocess
 import string
 import shutil
 import random
 import pathlib
 from itertools import batched

 source_path = pathlib.Path(os.environ.get("DOCUMENT_SOURCE_PATH"))
 work_path = pathlib.Path(os.environ.get("DOCUMENT_WORKING_PATH"))
 salt = os.environ.get(
    "TASK_ID", 
    ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
 )[:5]

 # Only process documents from the "single" folder
 if source_path.parent.name != "single":
    exit(0)

 # Check if PDF has more than 2 pages
 out = subprocess.check_output(["/usr/bin/pdfinfo", str(work_path)])
 for l in out.splitlines():
    if l.strip().startswith(b"Pages:"):
        num_pages = int(l.split()[1])

        if num_pages <= 2 or num_pages % 2 == 1:
            print("Splitter: Less than two or odd number of pages. Exiting..")
            exit(0)

        print("Splitter: Found", num_pages, "pages to split")
        break

 # Split file
 subprocess.run(["/usr/bin/pdfseparate", str(work_path), f"{work_path.parent}/{work_path.stem}-{salt}-%d.pdf"])

 # Merge files
 files = list(work_path.parent.glob(f"{work_path.stem}-{salt}-*.pdf"))
 files.sort(key=lambda f: int(''.join(filter(str.isdigit, f.name))))

 i = 1
 for x, y in batched(files, n=2):

    if i == 1:
        result_name = str(work_path)
    else:
        result_name = f"{source_path.parent}/{work_path.stem}-{i}.pdf"

    print("Splitter: Processing pages", (i-1)*2, " and ", (i-1)*2+1)
    print("Splitter: Store result in", result_name)

    # Check if blank page
    out = subprocess.check_output(["/usr/bin/gs", "-o", "-", "-sDEVICE=ink_cov", y])
    for l in out.splitlines():
        if b"CMYK" in l:
            blank_value = sum(map(float, l.split()[:4]))
            break
    
    if blank_value < BLANK_THRESHOLD:
        print("Splitter: Second page is blank!")
        # Keep only first page. 
        shutil.move(x, result_name)
        pathlib.Path(y).unlink()
    else:
        subprocess.run(["/usr/bin/pdfunite", x, y, result_name])
        pathlib.Path(x).unlink()
        pathlib.Path(y).unlink()
    print("Splitter: Finished", result_name)

    i += 1

 print("Splitter: Done")
	#!/bin/env python3

	"""
	Simple paperless-ngx pre-consumption script for scanners (such as the
	Epson ES-580W) that have no "proper" single-sheet PDF mode, i.e. a mode
	where a stack of paper is scanned into individual PDF files, with the two
	sides of each double-sided sheet combined as two pages of one PDF file.

	This script takes a multi-page PDF document and splits it into
	2-page documents. If the second page of those documents is empty, it'll
	be removed. In order to use it, you should set your scanner to
	"multi-page PDF" and then have it scan to a `single` subfolder of the
	consumption folder. Only documents in this `single` folder are processed
	by the script. Also, don't set the scanner to ignore blank pages, as they
	are removed by this script instead.

	Maybe you need to adjust the threshold for blank pages here, depending on
	your scanner.

	Requires Python 3.12!
	"""

	# Blank-page threshold: the second page of each pair is dropped when the sum
	# of the four C/M/Y/K values that Ghostscript's ink_cov device prints for it
	# is below this number. Maybe adjust this, depends on the scanner.
	BLANK_THRESHOLD = 0.8

	# ------------ Don't change anything below.

	import argparse
	import os
	import subprocess
	import string
	import shutil
	import random
	import pathlib
	from itertools import batched

	source_path = pathlib.Path(os.environ.get("DOCUMENT_SOURCE_PATH"))
	work_path = pathlib.Path(os.environ.get("DOCUMENT_WORKING_PATH"))
	salt = os.environ.get(
	"TASK_ID",
	''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
	)[:5]

	# Only process documents from the "single" folder
	if source_path.parent.name != "single":
	exit(0)

	# Check if PDF has more than 2 pages
	out = subprocess.check_output(["/usr/bin/pdfinfo", str(work_path)])
	for l in out.splitlines():
	if l.strip().startswith(b"Pages:"):
	num_pages = int(l.split()[1])

	if num_pages <= 2 or num_pages % 2 == 1:
	print("Splitter: Less than two or odd number of pages. Exiting..")
	exit(0)

	print("Splitter: Found", num_pages, "pages to split")
	break

	# Split file
	subprocess.run(["/usr/bin/pdfseparate", str(work_path), f"{work_path.parent}/{work_path.stem}-{salt}-%d.pdf"])

	# Merge files
	files = list(work_path.parent.glob(f"{work_path.stem}-{salt}-*.pdf"))
	files.sort(key=lambda f: int(''.join(filter(str.isdigit, f.name))))

	i = 1
	for x, y in batched(files, n=2):

	if i == 1:
	result_name = str(work_path)
	else:
	result_name = f"{source_path.parent}/{work_path.stem}-{i}.pdf"

	print("Splitter: Processing pages", (i-1)2, " and ", (i-1)2+1)
	print("Splitter: Store result in", result_name)

	# Check if blank page
	out = subprocess.check_output(["/usr/bin/gs", "-o", "-", "-sDEVICE=ink_cov", y])
	for l in out.splitlines():
	if b"CMYK" in l:
	blank_value = sum(map(float, l.split()[:4]))
	break

	if blank_value < BLANK_THRESHOLD:
	print("Splitter: Second page is blank!")
	# Keep only first page.
	shutil.move(x, result_name)
	pathlib.Path(y).unlink()
	else:
	subprocess.run(["/usr/bin/pdfunite", x, y, result_name])
	pathlib.Path(x).unlink()
	pathlib.Path(y).unlink()
	print("Splitter: Finished", result_name)

	i += 1

	print("Splitter: Done")