Skip to content

Instantly share code, notes, and snippets.

@lukas-buergi
Created May 25, 2020 13:23
Show Gist options
  • Save lukas-buergi/47a2daea1f03456373c33334165b54b6 to your computer and use it in GitHub Desktop.
Save lukas-buergi/47a2daea1f03456373c33334165b54b6 to your computer and use it in GitHub Desktop.
Post process scanned books
#!/bin/bash
#######################################################################
# Copyright Lukas Bürgi 2020
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public
# License along with this file. If not, see
# <https://www.gnu.org/licenses/>.
########################################################################
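# How to use: ./compress file.pdf [denoise]
# The compressed result replaces file.pdf in place; standard output and
# standard error of the run are written to file.pdf.log-1.txt and
# file.pdf.log-2.txt next to it.
# If you skip the denoise, it should be lossless, but will be extremely
# slow and result in somewhat bigger files.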
# With denoise enabled, jpegs will be cropped to remove borders
# containing only noise (leaving a considerable safety margin) and on
# all other images some measures to remove black speckles will be taken
# (which should also be pretty safe, but only if the images are high
# quality and contain text, everything else could be mangled beyond use).
#
# Caveats:
# There are some fixed assumptions which will cause problems if untrue:
# * Each pdf page consists of a single image
# * jpeg pages are covers and the like and can't be compressed further
# * All pages in other formats are black and white text
# There is also a possible bug with jbig2enc which leads to a fixed
# pixel density metadata setting of 72dpi for text pages. To fix this
# you need to modify pdf.py around line 150 to replace
# if xres == 0:
# xres = dpi
# if yres == 0:
# yres = dpi
# with
# xres = dpi
# yres = dpi
# where dpi is a variable that is configured at the beginning of the file
# and needs to be set to the dpi of your scan, for example 600.
#
# Prerequisites:
# Put the files jbig2 and pdf.py from
# https://github.com/agl/jbig2enc into the same directory as this script
# and install
# pdftk imagemagick time pdfimages parallel jpegtran img2pdf rename bc
# so that they are on the path. Unlike the jbig2 utilities, these tools
# are packaged by most distributions, so they are simply assumed to be
# installed. rename has to be the Perl variant that accepts an s///
# expression. This file won't work if called from path, i.e. don't try to
# install it.
########################################################################
# make bash behave more like a proper language
set -euo pipefail
shopt -s nullglob
# make debugging easier
set -x
# Validate the command line before touching any file, so that a bad
# invocation fails fast, reports the problem on the terminal (the log
# redirection below has not happened yet) and leaves nothing behind.
if test $# -lt 1; then
  echo "Usage: $0 file.pdf [denoise]" >&2
  exit 1
fi
if test $# -eq 2 && test "$2" != "denoise"; then
  echo "Unknown option '$2'. Usage: $0 file.pdf [denoise]" >&2
  exit 1
fi
# bin folder: the directory that contains this script (jbig2 and pdf.py are
# expected next to it). Resolving it with cd/pwd gives a correct absolute
# path no matter whether $0 is relative or absolute; CDPATH is cleared so
# that cd cannot jump to (and print) an unrelated directory.
bin="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"
# input/output file folder
iofolder="$(dirname "$(realpath "$1")")"
iofile="$iofolder/$(basename "$1")"
# from here on stdout and stderr (including the set -x trace) go to log
# files next to the input file
exec > "$iofile".log-1.txt
exec 2> "$iofile".log-2.txt
# Inspect the input before doing any work. The image count is the number of
# lines printed by "pdfimages -list" minus two (header lines).
pdfImagesList="$(pdfimages -list "$iofile")"
imageListLines=$(wc -l <<<"$pdfImagesList")
numberOfImages=$(( imageListLines - 2 ))
numberOfPages=$(pdftk "$iofile" dump_data | grep -F 'NumberOfPages' | grep -o '[0-9]*')

# Only pdfs with exactly one image per page are handled.
if (( numberOfImages != numberOfPages )); then
  echo "File $iofile is not a standard scanned pdf with one image file per page. Skipping."
  exit 0
fi

# Every listed image that is not a jpeg is treated as a text page.
# grep -c still prints 0 when nothing matches; "|| true" hides its exit code.
numberOfJpegPages=$(grep -c -e 'jpeg' <<<"$pdfImagesList" || true)
numberOfTextPages=$(( numberOfImages - numberOfJpegPages ))
if (( numberOfTextPages == 0 )); then
  echo "File consists of just jpegs, can't do much. Skipping."
  exit 0
fi

# A jbig2 image in the list is taken as a sign of an earlier run.
# Exit status 1 of grep just means "no match" and is tolerated.
jbig2presence="$(grep -o -e 'jbig2' <<<"$pdfImagesList" || [[ $? == 1 ]])"
if [[ -n "$jbig2presence" ]]; then
  echo "Input already contains jbig2 encoded images, it was probably already processed. Skipping."
  exit 0
fi
# All temporary files live in a scratch folder next to the input file.
# mkdir fails (and aborts the script) if the folder is left over from an
# earlier run.
workdir="$iofolder/tmp-$(basename "$1")"
mkdir "$workdir"
# Extract the original images: -j keeps jpegs as they are, everything else
# is written as tiff.
SECONDS=0
mkdir "$workdir"/original
pdfimages -tiff -j "$1" "$workdir"/original/page
cd "$workdir"/original
# Zero-pad the page numbers: the first rule prefixes names with three
# characters between "page-" and the extension, the second one those with
# four, so the two passes together add up to two leading zeros.
for renameRule in 's/page-(...\....)/page-0$1/' 's/page-(....\....)/page-0$1/'; do
  rename "$renameRule" *.???
done
echo "Extracted individual pages from pdf in $SECONDS seconds."
# Prepare the pages for assembly. With the "denoise" option the jpegs are
# cropped and the text pages despeckled into denoised/; without it the text
# pages are only converted from tiff to png inside original/. Either way
# assemble-input ends up as a symlink to the folder holding the result.
if test $# -eq 2; then
  if test "$2" != "denoise"; then
    echo "Unknown option '$2', expected 'denoise'." >&2
    # remove the scratch folder, otherwise the next run fails at mkdir
    rm -r -- "${workdir:?}"
    exit 1
  fi
  mkdir "$workdir"/denoised
  cd "$workdir"/original
  # Crop the jpeg files -- comment out if not needed
  SECONDS=0
  for f in *.jpg; do
    # Straightforward approach, but blurring the full-size image is very slow:
    #area="$(convert "$f" -virtual-pixel edge -blur 0x100 -fuzz 5% -trim -format '%wx%h%O' info:)"
    # Improvement: blur a 1% thumbnail and scale that back up instead.
    originalDimensions=$(identify -format "%[fx:w]x%[fx:h]" "$f")
    convert "$f" -resize 1% -virtual-pixel edge -blur 0x2 "$f".small.jpg
    # Scale back up to the exact original dimensions (a scale factor would
    # introduce unacceptable rounding errors). The trailing "!" makes
    # ImageMagick ignore the thumbnail's slightly different aspect ratio;
    # without it the result is fitted inside the box and the trim geometry
    # would not line up with the original image.
    area="$(convert "$f".small.jpg -resize "${originalDimensions}!" -fuzz 2% -trim -format '%wx%h%O' info:)"
    rm "$f".small.jpg
    # lossless crop to the detected area
    jpegtran -crop "$area" "$f" > ../denoised/"$f"
  done
  echo "Cropped all extracted jpegs in $SECONDS seconds."
  SECONDS=0
  for f in *.tif; do
    outfile=../denoised/"${f%.tif}.png"
    # Errors are captured instead of aborting the script, because pages
    # that are not text make the connected-components step fail.
    messages="$(convert "$f" -morphology Close Diamond:1 -define connected-components:area-threshold=30 -define connected-components:mean-color=true -connected-components 4 "$outfile" 2>&1 || true)"
    if grep -q -e 'too many objects' <<<"$messages"; then
      echo "Non-text detected and left untouched: $f (zero-indexed)."
      convert "$f" "$outfile"
    elif [[ ! -s "$outfile" ]]; then
      # Any other failure must not silently drop the page, which would
      # shift all following pages; keep the page without despeckling.
      echo "Despeckling failed for $f, page left untouched: $messages" >&2
      convert "$f" "$outfile"
    fi
  done
  echo "Despeckled all text pages in $SECONDS seconds."
  cd "$workdir"
  ln -s denoised/ assemble-input
else
  cd "$workdir"/original
  SECONDS=0
  # convert in parallel; sem comes with GNU parallel
  for f in *.tif; do
    sem -j +0 convert "$f" "${f%.tif}.png"
  done
  sem --wait
  echo "Converted all text pages from tiff to png in $SECONDS seconds."
  cd "$workdir"
  ln -s original/ assemble-input
fi
# Wrap every jpeg page in a pdf of the same base name, stored next to it
# in assemble-input.
SECONDS=0
cd "$workdir"/assemble-input
for jpegPage in *.jpg; do
  wrappedPdf="$workdir/assemble-input/${jpegPage%.jpg}.pdf"
  img2pdf --nodate "$jpegPage" --output "$wrappedPdf"
done
#######################################
# Computes the pdftk "cat" arguments that splice the pdf with handle I into
# the pdf with handle A so that the inserted page ends up at a given
# position.
# Arguments: $1 - 1-based page number the inserted page must get
#            $2 - number of pages currently in A
# Outputs:   sets the global array catRanges
#######################################
jpegInsertionRanges() {
  local targetPage=$1 pagesInA=$2
  if (( targetPage <= 1 )); then
    # new first page
    catRanges=(I A)
  elif (( targetPage > pagesInA )); then
    # append; checked before the general case so that no range past the
    # end of A is ever requested
    catRanges=(A I)
  else
    # targetPage-1 pages in front, the rest behind
    catRanges=("A1-$(( targetPage - 1 ))" I "A${targetPage}-end")
  fi
}

# Build assembly.pdf inside assemble-input: text pages are compressed with
# jbig2 into one pdf, then the pdf-wrapped jpegs are inserted at their
# original positions.
if (( numberOfTextPages > 0 )); then
  # compress text
  cd "$workdir"
  mkdir jbig2-output
  cd jbig2-output
  SECONDS=0
  # stderr of jbig2 is discarded on purpose (as in the original script)
  "$bin"/jbig2 -s -p ../assemble-input/*.png 2>/dev/null
  echo "Compressed text pages using jbig2 in $SECONDS seconds."
  SECONDS=0
  "$bin"/pdf.py output > jbig2.pdf
  echo "Wrapped jbig2 images in pdf in $SECONDS seconds."
  # insert pdf-wrapped jpegs in right place in jbig2-pdf
  SECONDS=0
  cd "$workdir"/assemble-input
  cp ../jbig2-output/jbig2.pdf assembly.pdf
  # The glob expands in ascending page order, so when a jpeg is inserted
  # all pages in front of it are already in place.
  for f in ../original/*.jpg; do
    # copy file because I'm not sure if there will be a problem if input and output file are equal
    cp assembly.pdf tmp.pdf
    # the wrapped jpeg has the same base name as the extracted image
    pdfFilename="$(basename "$f" .jpg).pdf"
    # file names are zero-indexed, pdf pages one-indexed; 10# stops the
    # zero-padded number from being read as octal
    pageNumber="${pdfFilename%.pdf}"
    pageNumber=$(( 10#${pageNumber#page-} + 1 ))
    # get number of pages concatenated so far
    pagesSoFar=$(pdftk tmp.pdf dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')
    # insert pdf at correct place
    jpegInsertionRanges "$pageNumber" "$pagesSoFar"
    pdftk A=tmp.pdf I="$pdfFilename" cat "${catRanges[@]}" output assembly.pdf
  done
  echo "Put jpeg pages into pdf in $SECONDS seconds."
else
  # no text pages at all: just concatenate the wrapped jpegs
  cd "$workdir"/assemble-input
  pdftk *.pdf cat output assembly.pdf
fi
# Measure both files with du before the input is overwritten; the first
# tab-separated field of its output is the size.
compressedSize=$(du assembly.pdf | cut -f1)
originalSize=$(du "$iofile" | cut -f1)
# Replace the input file with the result and drop the scratch folder.
cp assembly.pdf "$iofile"
rm -r "$workdir"
echo "Successfully compressed pdf with ratio $(bc <<<"scale=2; $originalSize/$compressedSize")."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment