v-- · September 30, 2023 10:01
diff --git a/scanned_books.makefile b/scanned_books.makefile
 # This is a GNU Make file specifically designed for building PDF books from a directory of images.
 # It can process multiple images concurrently with `make --jobs=8`.
 #
 # Every book is different and has different processing requirements,
 # so it often makes sense to copy this file and adapt it for an individual book.
 # Adapting usually involves modifying the targets for the individual pages in non-trivial ways.
 #
 # The script builds a PDF file with a table of contents and an OCR layer.
 # I used to build DjVu files instead because of their better image compression, but I learned that
 # it's better to provide a PDF myself than to let people use bad converters.
 # Furthermore, PDF files with Group4-compressed bitonal images are as small as bitonal DjVu files.
 # See the history of this gist for a script for building DjVu files.
 # 
 # For building, we suppose that we have a directory of images named in ascending order, e.g. 0001.png, 0002.png, ....
 # We also suppose that the name of the directory will be the name of the resulting book (this name can include spaces).
 # A table of contents is added via bookmarks.txt in the same directory. The bookmark file format consists of blocks like
 #
 # BookmarkBegin
 # BookmarkTitle: Chapter 1. Introduction
 # BookmarkLevel: 1
 # BookmarkPageNumber: 7
 #
 # BookmarkBegin
 # BookmarkTitle: §1. Preliminary Notions
 # BookmarkLevel: 2
 # BookmarkPageNumber: 8
 #
 # (It is actually not limited to bookmarks and may contain other PDF metainformation).
 #
 # The script dependencies vary with the book, but they usually include
 # * ImageMagick (https://imagemagick.org/) - for general-purpose image processing
 # * unpaper (https://github.com/unpaper/unpaper) - for post-processing scanned pages
 # * Tesseract OCR (https://github.com/tesseract-ocr/tesseract) - for OCR
 # * ocrmypdf (https://github.com/ocrmypdf/OCRmyPDF) - for performing OCR on an existing PDF file via the above tool
 # * Ghostscript (https://www.ghostscript.com/) - for processing PDF files
 # * djvulibre (https://github.com/traycold/djvulibre) - for working with DjVu files
 # * dpsprep (https://github.com/kcroker/dpsprep) - for converting DjVu to PDF
 #
 # I do not like to put licenses on my code, so consider this makefile Unlicensed (https://unlicense.org/).

 # Configuration variables
 # Each comment sits on its own line: with a trailing `#` comment, the whitespace
 # between the value and the `#` would become part of the value.
 # Even in a book in Cyrillic, math features Latin and Greek letters
 OCR_LANGUAGES := rus+eng+grc
 # The list of pages to process (zero-padded to four digits, like the image file names)
 PAGE_RANGE := $(shell seq --format '%04g' 3 104)
 # The list of binaries whose non-existence will fail the build
 EXECUTABLES := magick unpaper ocrmypdf pdftk

 # Some technical variables
 # E is deliberately empty, so that `$E $E` below denotes a single literal space
 # This whitespace trick is from https://stackoverflow.com/a/56411000
 E :=
 # The name of the current directory, which is also the name of the book.
 # basename receives the whole quoted path, because $(notdir ...) works word by word
 # and would also return fragments of parent directories whose names contain spaces.
 OUTPUT_NAME_RAW := $(shell basename "$$(pwd)")
 # The same name with every space backslash-escaped, for use as a Make target and as an unquoted shell word
 OUTPUT_NAME := $(subst $E $E,\ ,$(OUTPUT_NAME_RAW))
 # First field of the `sum` output for the name; it gives each book its own temporary directory
 CHECKSUM := $(shell echo $(OUTPUT_NAME) | sum | cut --delimiter ' ' --fields 1)
 TMP_DIR := /var/tmp/build-scanned-book/$(CHECKSUM)
 # One processed image in the temporary directory per page in PAGE_RANGE
 PROCESSED_IMAGES := $(addprefix $(TMP_DIR)/,$(addsuffix _unpaper.pbm,$(PAGE_RANGE)))

 # Check if the necessary executables exist
 # `command -v` is specified by POSIX, whereas `which` is a separate program
 # that is not installed everywhere
 MISSING_DEPS := $(strip $(foreach exec, $(EXECUTABLES),\
 	$(if $(shell command -v $(exec) 2>/dev/null),,$(exec)) \
 ))
 # Abort while the makefile is being read, listing every missing executable at once
 $(if $(MISSING_DEPS),$(error Missing executables: $(MISSING_DEPS)),)

 # Check if the necessary pages exist
 # Only .png files count, because the page conversion rules in this file take %.png as their source;
 # adjust the extension here together with those rules
 MISSING_PAGES := $(strip $(foreach page, $(PAGE_RANGE),\
 	$(if $(wildcard $(page).png),,$(page)) \
 ))
 # Abort while the makefile is being read, listing every missing page at once
 $(if $(MISSING_PAGES),$(error Missing pages: $(MISSING_PAGES)),)

 # Without prerequisites, .NOTINTERMEDIATE asks Make not to treat any target as an intermediate file
 # (the special target was added in GNU Make 4.4)
 .NOTINTERMEDIATE:
 # Delete the target of a failed recipe, so that a half-written image or PDF
 # is never mistaken for an up-to-date one on the next run
 .DELETE_ON_ERROR:
 # These targets are commands, not files
 .PHONY: clean_tmp clean_pdf

 # The default goal: the finished book, named after the current directory
 # pdftk writes the OCRed PDF to the target, updated with the metadata from bookmarks.txt;
 # afterwards a recursive Make run removes the temporary directory
 $(OUTPUT_NAME).pdf: $(TMP_DIR)/ocr.pdf bookmarks.txt
 	pdftk $< \
 		update_info_utf8 $(word 2,$^) \
 		output '$@'
 	$(MAKE) clean_tmp

 # Remove the temporary directory with everything in it
 # $(RM) is Make's predefined `rm -f`
 clean_tmp:
 	$(RM) --recursive $(TMP_DIR)

 # Remove the finished book from the current directory
 # The unescaped name is used here because it is passed to the shell inside single quotes
 clean_pdf:
 	$(RM) '$(OUTPUT_NAME_RAW).pdf'

 # Create the temporary directory together with any missing parent directories
 $(TMP_DIR):
 	mkdir --parents $@

 # The intermediate targets
 # Convert a page to .pbm with a fixed 80% threshold
 # The temporary directory is an order-only prerequisite: it must exist,
 # but its timestamp never causes a page to be rebuilt
 $(TMP_DIR)/%_magick.pbm: %.png | $(TMP_DIR)
 	magick $< \
 		-threshold 80% \
 		$@

 # A rule can easily be specialized for a single page:
 # an explicit rule for one file takes precedence over the pattern rule for that file
 # Here page 0030 is converted to a bitonal image with an adaptive threshold instead
 $(TMP_DIR)/0030_magick.pbm: 0030.png | $(TMP_DIR)
 	magick $< \
 		-lat 20x20-5% \
 		$@

 # Another way to specialize is to use the if function
 # It is useful for when a list of pages needs to behave differently
 # Here we add --no-deskew for pages 10, 20 and 30
 # The stem $* is the zero-padded page name (e.g. 0010), so the list has to be zero-padded too;
 # an unpadded 10 would never match and the option would silently never be passed
 # unpaper has bulk processing built-in, but we use it on a per-file basis
 $(TMP_DIR)/%_unpaper.pbm: $(TMP_DIR)/%_magick.pbm
 	unpaper $< $(if $(filter $*,0010 0020 0030),--no-deskew,) $@

 # Join all processed pages, in PAGE_RANGE order, into a single PDF
 # with Group4 compression and the book name as the PDF title
 $(TMP_DIR)/combined.pdf: $(PROCESSED_IMAGES)
 	magick $^ \
 		-define pdf:Title='$(OUTPUT_NAME_RAW)' \
 		-verbose \
 		-compress group4 \
 		$@

 # Run ocrmypdf on the combined PDF with the configured languages
 $(TMP_DIR)/ocr.pdf: $(TMP_DIR)/combined.pdf
 	ocrmypdf \
 		--language=$(OCR_LANGUAGES) \
 		$< \
 		$@
	# This is a GNU Make file specifically designed for building PDF books from a directory of images.
	# It can process multiple images concurrently with `make --jobs=8`.
	#
	# Every book is different and has different processing requirements,
	# so it often makes sense to copy this file and adapt it for an individual book.
	# Adapting usually involves modifying the targets for the individual pages in non-trivial ways.
	#
	# The script builds a PDF file with a table of contents and an OCR layer.
	# I used to build DjVu files instead because of their better image compression, but I learned that
	# it's better to provide a PDF myself than to let people use bad converters.
	# Furthermore, PDF files with Group4-compressed bitonal images are as small as bitonal DjVu files.
	# See the history of this gist for a script for building DjVu files.
	#
	# For building, we suppose that we have a directory of images named in ascending order, e.g. 0001.png, 0002.png, ....
	# We also suppose that the name of the directory will be the name of the resulting book (this name can include spaces).
	# A table of contents is added via bookmarks.txt in the same directory. The bookmark file format consists of blocks like
	#
	# BookmarkBegin
	# BookmarkTitle: Chapter 1. Introduction
	# BookmarkLevel: 1
	# BookmarkPageNumber: 7
	#
	# BookmarkBegin
	# BookmarkTitle: §1. Preliminary Notions
	# BookmarkLevel: 2
	# BookmarkPageNumber: 8
	#
	# (It is actually not limited to bookmarks and may contain other PDF metainformation).
	#
	# The script dependencies vary with the book, but they usually include
	# * ImageMagick (https://imagemagick.org/) - for general-purpose image processing
	# * unpaper (https://github.com/unpaper/unpaper) - for post-processing scanned pages
	# * Tesseract OCR (https://github.com/tesseract-ocr/tesseract) - for OCR
	# * ocrmypdf (https://github.com/ocrmypdf/OCRmyPDF) - for performing OCR on an existing PDF file via the above tool
	# * Ghostscript (https://www.ghostscript.com/) - for processing PDF files
	# * djvulibre (https://github.com/traycold/djvulibre) - for working with DjVu files
	# * dpsprep (https://github.com/kcroker/dpsprep) - for converting DjVu to PDF
	#
	# I do not like to put licenses on my code, so consider this makefile Unlicensed (https://unlicense.org/).

	# Configuration variables
	# Each comment sits on its own line: with a trailing `#` comment, the whitespace
	# between the value and the `#` would become part of the value.
	# Even in a book in Cyrillic, math features Latin and Greek letters
	OCR_LANGUAGES := rus+eng+grc
	# The list of pages to process (zero-padded to four digits, like the image file names)
	PAGE_RANGE := $(shell seq --format '%04g' 3 104)
	# The list of binaries whose non-existence will fail the build
	EXECUTABLES := magick unpaper ocrmypdf pdftk

	# Some technical variables
	# E is deliberately empty, so that `$E $E` below denotes a single literal space
	# This whitespace trick is from https://stackoverflow.com/a/56411000
	E :=
	# The name of the current directory, which is also the name of the book.
	# basename receives the whole quoted path, because $(notdir ...) works word by word
	# and would also return fragments of parent directories whose names contain spaces.
	OUTPUT_NAME_RAW := $(shell basename "$$(pwd)")
	# The same name with every space backslash-escaped, for use as a Make target and as an unquoted shell word
	OUTPUT_NAME := $(subst $E $E,\ ,$(OUTPUT_NAME_RAW))
	# First field of the `sum` output for the name; it gives each book its own temporary directory
	# The pipes must be unescaped, otherwise echo prints them instead of feeding sum and cut
	CHECKSUM := $(shell echo $(OUTPUT_NAME) | sum | cut --delimiter ' ' --fields 1)
	TMP_DIR := /var/tmp/build-scanned-book/$(CHECKSUM)
	# One processed image in the temporary directory per page in PAGE_RANGE
	PROCESSED_IMAGES := $(addprefix $(TMP_DIR)/,$(addsuffix _unpaper.pbm,$(PAGE_RANGE)))

	# Check if the necessary executables exist
	# `command -v` is specified by POSIX, whereas `which` is a separate program
	# that is not installed everywhere
	MISSING_DEPS := $(strip $(foreach exec, $(EXECUTABLES),\
		$(if $(shell command -v $(exec) 2>/dev/null),,$(exec)) \
	))
	# Abort while the makefile is being read, listing every missing executable at once
	$(if $(MISSING_DEPS),$(error Missing executables: $(MISSING_DEPS)),)

	# Check if the necessary pages exist
	# Only .png files count, because the page conversion rules in this file take %.png as their source;
	# adjust the extension here together with those rules
	MISSING_PAGES := $(strip $(foreach page, $(PAGE_RANGE),\
		$(if $(wildcard $(page).png),,$(page)) \
	))
	# Abort while the makefile is being read, listing every missing page at once
	$(if $(MISSING_PAGES),$(error Missing pages: $(MISSING_PAGES)),)

	# Without prerequisites, .NOTINTERMEDIATE asks Make not to treat any target as an intermediate file
	# (the special target was added in GNU Make 4.4)
	.NOTINTERMEDIATE:
	# Delete the target of a failed recipe, so that a half-written image or PDF
	# is never mistaken for an up-to-date one on the next run
	.DELETE_ON_ERROR:
	# These targets are commands, not files
	.PHONY: clean_tmp clean_pdf

	# The default goal: the finished book, named after the current directory
	# pdftk writes the OCRed PDF to the target, updated with the metadata from bookmarks.txt;
	# afterwards a recursive Make run removes the temporary directory
	$(OUTPUT_NAME).pdf: $(TMP_DIR)/ocr.pdf bookmarks.txt
		pdftk $< \
			update_info_utf8 $(word 2,$^) \
			output '$@'
		$(MAKE) clean_tmp

	# Remove the temporary directory with everything in it
	# $(RM) is Make's predefined `rm -f`
	clean_tmp:
		$(RM) --recursive $(TMP_DIR)

	# Remove the finished book from the current directory
	# The unescaped name is used here because it is passed to the shell inside single quotes
	clean_pdf:
		$(RM) '$(OUTPUT_NAME_RAW).pdf'

	# Create the temporary directory together with any missing parent directories
	$(TMP_DIR):
		mkdir --parents $@

	# The intermediate targets
	# Convert a page to .pbm with a fixed 80% threshold
	# The bare `|` introduces an order-only prerequisite: the temporary directory must exist,
	# but its timestamp never causes a page to be rebuilt
	$(TMP_DIR)/%_magick.pbm: %.png | $(TMP_DIR)
		magick $< -threshold 80% $@

	# We can easily specialize a rule for a certain page
	# For example, we can use an adaptive threshold when converting to a bitonal image
	# The bare `|` introduces an order-only prerequisite: the temporary directory must exist,
	# but its timestamp never causes the page to be rebuilt
	$(TMP_DIR)/0030_magick.pbm: 0030.png | $(TMP_DIR)
		magick $< -lat 20x20-5% $@

	# Another way to specialize is to use the if function
	# It is useful for when a list of pages needs to behave differently
	# Here we add --no-deskew for pages 10, 20 and 30
	# The stem $* is the zero-padded page name (e.g. 0010), so the list has to be zero-padded too;
	# an unpadded 10 would never match and the option would silently never be passed
	# unpaper has bulk processing built-in, but we use it on a per-file basis
	$(TMP_DIR)/%_unpaper.pbm: $(TMP_DIR)/%_magick.pbm
		unpaper $< $(if $(filter $*,0010 0020 0030),--no-deskew,) $@

	# Join all processed pages, in PAGE_RANGE order, into a single PDF
	# with Group4 compression and the book name as the PDF title
	$(TMP_DIR)/combined.pdf: $(PROCESSED_IMAGES)
		magick $^ \
			-define pdf:Title='$(OUTPUT_NAME_RAW)' \
			-verbose \
			-compress group4 \
			$@

	# Run ocrmypdf on the combined PDF with the configured languages
	$(TMP_DIR)/ocr.pdf: $(TMP_DIR)/combined.pdf
		ocrmypdf \
			--language=$(OCR_LANGUAGES) \
			$< \
			$@