Last active
September 30, 2023 10:01
-
-
Save v--/0dd7400444bbc2c02670e162147cb5fe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a GNU Make file specifically designed for building PDF books from a directory of images. | |
# It can process multiple images concurrently with `make --jobs=8`. | |
# | |
# Every book is different and has different processing requirements, | |
# and it often makes sense to copy this file and adapt it for an individual book.
# Adapting usually involves modifying the targets for the individual pages in non-trivial ways.
# | |
# The script builds a PDF file with a table of contents and an OCR layer. | |
# I used to build DjVu files instead because of their better image compression, but I learned that | |
# it's better to provide a PDF myself than to let people use bad converters. | |
# Furthermore, PDF files with Group4-compressed bitonal images are as small as bitonal DjVu files. | |
# See the history of this gist for a script for building DjVu files. | |
# | |
# For building, we suppose that we have a directory of images named in ascending order, e.g. 0001.png, 0002.png, .... | |
# We also suppose that the name of the directory will be the name of the resulting book (this name can include spaces). | |
# A table of contents is added via bookmarks.txt in the same directory. The bookmark file format consists of blocks like | |
# | |
# BookmarkBegin | |
# BookmarkTitle: Chapter 1. Introduction | |
# BookmarkLevel: 1 | |
# BookmarkPageNumber: 7 | |
# | |
# BookmarkBegin | |
# BookmarkTitle: §1. Preliminary Notions | |
# BookmarkLevel: 2 | |
# BookmarkPageNumber: 8 | |
# | |
# (It is actually not limited to bookmarks and may contain other PDF metainformation). | |
# | |
# The script dependencies vary with the book, but they usually include | |
# * ImageMagick (https://imagemagick.org/) - for general-purpose image processing | |
# * unpaper (https://github.com/unpaper/unpaper) - for post-processing scanned pages | |
# * Tesseract OCR (https://github.com/tesseract-ocr/tesseract) - for OCR | |
# * ocrmypdf (https://github.com/ocrmypdf/OCRmyPDF) - for performing OCR on an existing PDF file via the above tool | |
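# * Ghostscript (https://www.ghostscript.com/) - for processing PDF files
# * pdftk (https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) - for adding the bookmarks and other metainformation to the PDF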
# * djvulibre (https://github.com/traycold/djvulibre) - for working with DjVu files | |
# * dpsprep (https://github.com/kcroker/dpsprep) - for converting DjVu to PDF | |
# | |
# I do not like to put licenses on my code, so consider this makefile Unlicensed (https://unlicense.org/). | |
# Configuration variables
# NOTE: the comments sit on their own lines on purpose. GNU Make keeps the whitespace
# between a value and a trailing `#` comment as part of the value.

# Even in a book in Cyrillic, math features Latin and Greek letters
OCR_LANGUAGES := rus+eng+grc
# The list of pages to process (zero-padded to four digits, e.g. 0003)
PAGE_RANGE := $(shell seq --format '%04g' 3 104)
# The list of binaries whose non-existence will fail the build
EXECUTABLES := magick unpaper ocrmypdf pdftk

# Some technical variables

# E is deliberately empty, so that `$E $E` expands to a single space.
# This whitespace trick is from https://stackoverflow.com/a/56411000
E :=
# Base name of the working directory; it may contain spaces.
# basename is run on the quoted path because $(notdir ...) acts on whitespace-separated
# words and would also keep a fragment of every parent directory whose name has a space.
OUTPUT_NAME_RAW := $(shell basename "$$(pwd)")
# The same name with its spaces backslash-escaped, so it can be used in a target name
OUTPUT_NAME := $(subst $E $E,\ ,$(OUTPUT_NAME_RAW))
# Checksum of the book name; it gives every book its own scratch directory
CHECKSUM := $(shell echo $(OUTPUT_NAME) | sum | cut --delimiter ' ' --fields 1)
TMP_DIR := /var/tmp/build-scanned-book/$(CHECKSUM)
# One processed bitonal image per page of PAGE_RANGE
PROCESSED_IMAGES := $(addprefix $(TMP_DIR)/,$(addsuffix _unpaper.pbm,$(PAGE_RANGE)))
# Stop at parse time when a required executable cannot be found via `which`
MISSING_DEPS := $(strip $(foreach tool,$(EXECUTABLES),$(if $(shell which $(tool) 2>/dev/null),,$(tool))))
ifneq ($(MISSING_DEPS),)
$(error Missing executables: $(MISSING_DEPS))
endif

# Stop at parse time when a page of PAGE_RANGE has no source file in the current
# directory; a file with any extension counts as present
MISSING_PAGES := $(strip $(foreach number,$(PAGE_RANGE),$(if $(wildcard $(number).*),,$(number))))
ifneq ($(MISSING_PAGES),)
$(error Missing pages: $(MISSING_PAGES))
endif
# Special targets

# With no prerequisites listed, no target is treated as an intermediate file
.NOTINTERMEDIATE:
# Remove a target whose recipe failed, so that a half-written file is never
# mistaken for an up-to-date one on the next run
.DELETE_ON_ERROR:
# These targets are commands, not files
.PHONY: clean_tmp clean_pdf
# The default goal: the finished book.
# pdftk applies the metainformation from bookmarks.txt to the OCR-ed PDF,
# then a sub-make runs clean_tmp.
$(OUTPUT_NAME).pdf: $(TMP_DIR)/ocr.pdf bookmarks.txt
	pdftk $< update_info_utf8 $(word 2,$^) output '$@'
	$(MAKE) clean_tmp
# Delete the scratch directory together with everything in it
clean_tmp:
	rm --recursive --force $(TMP_DIR)
# Delete the resulting book; the raw name is quoted because it may contain spaces
clean_pdf:
	rm --force '$(OUTPUT_NAME_RAW).pdf'
# Create the scratch directory, including any missing parent directories
$(TMP_DIR):
	mkdir --parents $@
# The intermediate targets

# Convert a PNG page into a bitonal PBM image with a fixed 80% threshold.
# The scratch directory is an order-only prerequisite: it has to exist,
# but its timestamp never triggers a rebuild.
$(TMP_DIR)/%_magick.pbm: %.png | $(TMP_DIR)
	magick $< -threshold 80% $@
# A rule can easily be specialized for a single page with an explicit rule.
# Here page 0030 is converted to a bitonal image with an adaptive threshold
# instead of the fixed one.
$(TMP_DIR)/0030_magick.pbm: 0030.png | $(TMP_DIR)
	magick $< -lat 20x20-5% $@
# Another way to specialize is to use the if function
# It is useful for when a list of pages needs to behave differently
# Here we add --no-deskew for pages 10, 20 and 30
# The stem $* is the zero-padded page number taken from the file name (e.g. 0010),
# so the pages in the filter list must be written zero-padded as well
# unpaper has bulk processing built-in, but we use it on a per-file basis
$(TMP_DIR)/%_unpaper.pbm: $(TMP_DIR)/%_magick.pbm
	unpaper $< $(if $(filter $*,0010 0020 0030),--no-deskew,) $@
# Bundle all processed pages into one Group4-compressed PDF
# whose title is the raw book name
$(TMP_DIR)/combined.pdf: $(PROCESSED_IMAGES)
	magick $^ -define pdf:Title='$(OUTPUT_NAME_RAW)' -verbose -compress group4 $@
# Run OCR on the combined PDF for the configured languages
$(TMP_DIR)/ocr.pdf: $(TMP_DIR)/combined.pdf
	ocrmypdf --language=$(OCR_LANGUAGES) $< $@
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment