Created
May 25, 2020 13:23
-
-
Save lukas-buergi/47a2daea1f03456373c33334165b54b6 to your computer and use it in GitHub Desktop.
Post process scanned books
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
####################################################################### | |
# Copyright Lukas Bürgi 2020 | |
# | |
# This file is free software: you can redistribute it and/or modify it | |
# under the terms of the GNU Affero General Public License as published | |
# by the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This file is distributed in the hope that it will be useful, but WITHOUT | |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public | |
# License for more details. | |
# | |
# You should have received a copy of the GNU Affero General Public | |
# License along with this file. If not, see | |
# <https://www.gnu.org/licenses/>. | |
######################################################################## | |
# How to use: ./compress file.pdf [denoise] | |
# If you skip the denoise, it should be lossless, but will be extremely | |
# slow and result in somewhat bigger files. | |
# With denoise enabled, jpegs will be cropped to remove borders | |
# containing only noise (leaving a considerable safety margin) and on | |
# all other images some measures to remove black speckles will be taken | |
# (which should also be pretty safe, but only if the images are high | |
# quality and contain text, everything else could be mangled beyond use). | |
# | |
# Caveats: | |
# There are some fixed assumptions which will cause problems if untrue: | |
# * Each pdf page consists of a single image | |
# * jpeg pages are covers and the like and can't be compressed further | |
# * All pages in other formats are black and white text | |
# There is also a possible bug with jbig2enc which leads to a fixed | |
# pixel density metadata setting of 72dpi for text pages. To fix this | |
# you need to modify pdf.py around line 150 to replace | |
# if xres == 0: | |
# xres = dpi | |
# if yres == 0: | |
# yres = dpi | |
# with | |
# xres = dpi | |
# yres = dpi | |
# where dpi is a variable that is configured at the beginning of the file | |
# and needs to be set to the dpi of your scan, for example 600. | |
# | |
# Prerequisites: | |
# Put the files jbig2 and pdf.py from | |
# https://github.com/agl/jbig2enc into the same directory as this script | |
# and install | |
# pdftk imagemagick time pdfimages parallel jpegtran img2pdf | |
# so that they are on the path. Unlike the jbig2 utilities, these tools are
# packaged by most distributions, so they are expected to be installed
# system-wide. This script looks for jbig2 and pdf.py relative to its own
# location and won't work if it is called from the path, i.e. don't try to
# install it.
######################################################################## | |
# Make bash behave more like a proper language: abort on failing commands,
# on use of unset variables and on failures anywhere inside a pipeline, and
# let globs that match nothing expand to nothing.
set -euo pipefail
shopt -s nullglob
# Make debugging easier: trace every command as it is executed.
set -x

# Fail with a usage hint instead of bash's bare "unbound variable" error.
if test $# -lt 1; then
  echo "Usage: $0 file.pdf [denoise]" >&2
  exit 1
fi

# bin folder: the directory this script lives in, where jbig2 and pdf.py are
# looked up later. Only a relative $0 is anchored to the current directory;
# prefixing the working directory to an absolute $0 would yield a path that
# does not exist.
scriptDir="$(dirname "$0")"
case "$scriptDir" in
  /*) bin="$scriptDir" ;;
  *) bin="$(pwd)/$scriptDir" ;;
esac

# input/output file folder (absolute) and the absolute path of the pdf itself
iofolder="$(dirname "$(realpath "$1")")"
iofile="$iofolder/$(basename "$1")"

# From here on stdout and stderr (including the trace enabled above) are
# written to two log files next to the input file.
exec > "$iofile".log-1.txt
exec 2> "$iofile".log-2.txt
# Pre-flight checks: take stock of the images inside the pdf and stop early
# (with exit status 0) for inputs this script cannot or need not process.
pdfImagesList="$(pdfimages -list "$iofile")"

# Two lines of the listing are not images and are subtracted from the count.
listingLines=$(wc -l <<<"$pdfImagesList")
numberOfImages=$(( listingLines - 2 ))
numberOfPages=$(
  pdftk "$iofile" dump_data \
    | grep -F -e 'NumberOfPages' \
    | grep -o -e '[0-9]*'
)
if [[ "$numberOfImages" -ne "$numberOfPages" ]]; then
  echo "File $iofile is not a standard scanned pdf with one image file per page. Skipping."
  exit 0
fi

# Every listing line mentioning "jpeg" counts as a jpeg page; all remaining
# images are treated as text pages. grep exits non-zero when nothing matches,
# which must not abort the script.
numberOfJpegPages=$(grep -c -e 'jpeg' <<<"$pdfImagesList" || true)
numberOfTextPages=$(( numberOfImages - numberOfJpegPages ))
if [[ "$numberOfTextPages" -eq 0 ]]; then
  echo "File consists of just jpegs, can't do much. Skipping."
  exit 0
fi

# grep status 1 only means "no match"; any other failure still aborts.
jbig2Matches="$(grep -o -e 'jbig2' <<<"$pdfImagesList" || [[ $? == 1 ]])"
if [[ -n "$jbig2Matches" ]]; then
  echo "Input already contains jbig2 encoded images, it was probably already processed. Skipping."
  exit 0
fi
# put all temporary files into this folder
workdir="$iofolder/tmp-$(basename "$1")"
mkdir "$workdir"

# extract original images: jpeg streams are written as .jpg (-j), everything
# else is written as .tif
SECONDS=0
mkdir "$workdir"/original
pdfimages -tiff -j "$iofile" "$workdir"/original/page
cd "$workdir"/original

# Zero-pad the page counter in every file name to five digits so that glob
# order equals page order even for documents with 1000 or more pages.
# This is done in plain bash because the "rename" command is a different,
# incompatible program depending on the distribution.
for f in page-*.???; do
  extension="${f##*.}"
  counter="${f#page-}"
  counter="${counter%.*}"
  # leave alone anything that does not look like page-<digits>.<ext>
  [[ "$counter" =~ ^[0-9]+$ ]] || continue
  # 10# forces base ten; a leading zero would otherwise mean octal
  printf -v padded 'page-%05d.%s' "$(( 10#$counter ))" "$extension"
  if [[ "$f" != "$padded" ]]; then
    mv -- "$f" "$padded"
  fi
done
echo "Extracted individual pages from pdf in $SECONDS seconds."
# remove noise if wanted
# Either branch ends by creating the symlink "$workdir"/assemble-input, which
# points at the folder holding the page images for the remaining steps.
if test $# -eq 2; then
  if test "$2" != "denoise"; then
    # Say why we stop and remove the temporary folder, which would otherwise
    # make the next run fail when it tries to create that folder again.
    echo "Unknown second argument '$2'; the only supported option is 'denoise'." >&2
    cd "$iofolder"
    rm -r -- "${workdir:?}"
    exit 1
  fi
  mkdir "$workdir"/denoised
  cd "$workdir"/original

  # Crop the jpeg files -- comment out if not needed
  SECONDS=0
  for f in *.jpg; do
    # Straightforward variant, blurring at full resolution, which is very slow:
    #area="$(convert "$f" -virtual-pixel edge -blur 0x100 -fuzz 5% -trim -format '%wx%h%O' info:)"
    # Improvement: blur a 1% thumbnail instead.
    originalDimensions=$(identify -format "%[fx:w]x%[fx:h]" "$f")
    convert "$f" -resize 1% -virtual-pixel edge -blur 0x2 "$f".small.jpg
    # scale back up to exact dimensions (not using factor which would introduce unacceptable rounding errors)
    area="$(convert "$f".small.jpg -resize "$originalDimensions" -fuzz 2% -trim -format '%wx%h%O' info:)"
    rm "$f".small.jpg
    # crop the original to the trimmed area reported above
    jpegtran -crop "$area" "$f" > ../denoised/"$f"
  done
  echo "Cropped all extracted jpegs in $SECONDS seconds."

  # Despeckle the text pages and store them as png.
  SECONDS=0
  for f in *.tif; do
    outfile=../denoised/"${f%.tif}.png"
    # A failing convert must not abort the script; its messages are inspected instead.
    messages="$(convert "$f" -morphology Close Diamond:1 -define connected-components:area-threshold=30 -define connected-components:mean-color=true -connected-components 4 "$outfile" 2>&1 || true)"
    if grep -q -e 'too many objects' <<<"$messages"; then
      # Such pages are converted to png without any filtering.
      echo "Non-text detected and left untouched: $f (zero-indexed)."
      convert "$f" "$outfile"
    fi
  done
  echo "Despeckled all text pages in $SECONDS seconds."

  cd "$workdir"
  ln -s denoised/ assemble-input
else
  cd "$workdir"/original
  SECONDS=0
  # Queue one conversion per page through sem, then wait for all of them.
  for f in *.tif; do
    sem -j +0 convert "$f" "${f%.tif}.png"
  done
  sem --wait
  echo "Converted all text pages from tiff to png in $SECONDS seconds."

  cd "$workdir"
  ln -s original/ assemble-input
fi
# wrap jpegs in pdfs: every jpeg in the assembly folder gets a pdf of the
# same base name next to it
SECONDS=0
cd "$workdir"/assemble-input
for jpegPage in *.jpg; do
  wrappedPage="$workdir/assemble-input/${jpegPage%.jpg}.pdf"
  img2pdf --nodate "$jpegPage" --output "$wrappedPage"
done
# Print the page-range arguments for pdftk's "cat" operation that insert the
# single-page document I into document A.
#   $1 - 1-based position the inserted page must have in the result
#   $2 - number of pages currently in A
# Output: the ranges as one space-separated line.
insertionRanges() {
  local position=$1
  local pagesInA=$2
  if (( position <= 1 )); then
    # new first page
    echo "I A"
  elif (( position > pagesInA )); then
    # new last page; checked before the general case so that no range ever
    # starts beyond the last page of A
    echo "A I"
  else
    # the position-1 pages in front of the new page, the new page, the rest
    echo "A1-$(( position - 1 )) I A${position}-end"
  fi
}

if test $numberOfTextPages -gt 0; then
  # compress text
  cd "$workdir"
  mkdir jbig2-output
  cd jbig2-output
  SECONDS=0
  "$bin"/jbig2 -s -p ../assemble-input/*.png 2>/dev/null
  echo "Compressed text pages using jbig2 in $SECONDS seconds."
  SECONDS=0
  "$bin"/pdf.py output > jbig2.pdf
  echo "Wrapped jbig2 images in pdf in $SECONDS seconds."

  # insert pdf-wrapped jpegs in right place in jbig2-pdf
  # The jpegs are visited in ascending page order, so whenever one is inserted
  # all pages in front of it are already part of the assembly.
  SECONDS=0
  cd "$workdir"/assemble-input
  cp ../jbig2-output/jbig2.pdf assembly.pdf
  for f in ../original/*.jpg; do
    # copy file because I'm not sure if there will be a problem if input and output file are equal
    cp assembly.pdf tmp.pdf
    # get pdf-file-name
    pdfFilename="$(basename "$f" .jpg).pdf"
    # get page number from $f: the file name holds the zero-based page index;
    # 10# forces base ten because of the leading zeros
    pageIndex="${pdfFilename%.pdf}"
    pageIndex="${pageIndex#page-}"
    pageNumber=$(( 10#$pageIndex + 1 ))
    # get number of pages concatenated so far
    pagesSoFar=$(pdftk tmp.pdf dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')
    # insert pdf at correct place
    read -r -a catRanges <<<"$(insertionRanges "$pageNumber" "$pagesSoFar")"
    pdftk A=tmp.pdf I="$pdfFilename" cat "${catRanges[@]}" output assembly.pdf
  done
  echo "Put jpeg pages into pdf in $SECONDS seconds."
else
  # no text pages at all: simply concatenate the pdf-wrapped jpegs
  cd "$workdir"/assemble-input
  pdftk *.pdf cat output assembly.pdf
fi
# Measure both files before the original is overwritten; du prints the size
# in its first, tab-separated column.
originalSize=$(du "$iofile" | cut -f1)
compressedSize=$(du assembly.pdf | cut -f1)

# Replace the input file with the result and drop all temporary files.
cp assembly.pdf "$iofile"
rm -r "$workdir"

echo "Successfully compressed pdf with ratio $(bc <<<"scale=2; $originalSize/$compressedSize")."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment