lukas-buergi · May 25, 2020 13:23
diff --git a/compress.sh b/compress.sh
 #!/bin/bash
 #######################################################################
 # Copyright Lukas Bürgi 2020
 #
 # This file is free software: you can redistribute it and/or modify it
 # under the terms of the GNU Affero General Public License as published
 # by the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This file is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
 # License for more details.
 #
 # You should have received a copy of the GNU Affero General Public
 # License along with this file.  If not, see
 # <https://www.gnu.org/licenses/>.
 ########################################################################
 # How to use: ./compress file.pdf [denoise]
 # If you skip the denoise, it should be lossless, but will be extremely
 # slow and result in somewhat bigger files.
 # With denoise enabled, jpegs will be cropped to remove borders
 # containing only noise (leaving a considerable safety margin) and on
 # all other images some measures to remove black speckles will be taken
 # (which should also be pretty safe, but only if the images are high
 # quality and contain text, everything else could be mangled beyond use).
 #
 # Caveats:
 # There are some fixed assumptions which will cause problems if untrue:
 #  * Each pdf page consists of a single image
 #  * jpeg pages are covers and the like and can't be compressed further
 #  * All pages in other formats are black and white text
 # There is also a possible bug with jbig2enc which leads to a fixed
 # pixel density metadata setting of 72dpi for text pages. To fix this
 # you need to modify pdf.py around line 150 to replace
 #    if xres == 0:
 #        xres = dpi
 #    if yres == 0:
 #        yres = dpi
 # with
 #    xres = dpi
 #    yres = dpi
 # where dpi is a variable that is configured at the beginning of the file
 # and needs to be set to the dpi of your scan, for example 600.
 #
 # Prerequisites:
 # Put the files jbig2 and pdf.py from
 # https://github.com/agl/jbig2enc into the same directory as this script
 # and install
 # pdftk imagemagick time pdfimages parallel jpegtran img2pdf bc
 # so that they are on the path (in contrast to the jbig2 utilities, they
 # should be included in most distributions so that they are assumed to be
 # installed). This file won't work if called from path, i.e. don't try to
 # install it.
 ########################################################################

 # make bash behave more like a proper language
 set -euo pipefail
 shopt -s nullglob

 # make debugging easier
 set -x

 # Without a file argument "$1" would only trigger a bare "unbound variable"
 # error from set -u; print a usage hint instead (stderr is still the
 # terminal at this point).
 if test $# -lt 1; then
   echo "How to use: $0 file.pdf [denoise]" >&2
   exit 1
 fi

 # bin folder: the directory this script lives in, as an absolute path.
 # Resolved via cd/pwd so that it is correct for both relative and absolute
 # values of $0 (gluing $(pwd) in front of an absolute $0 yields a path that
 # does not exist). cd's output is discarded in case CDPATH makes it print.
 bin="$(cd -- "$(dirname -- "$0")" >/dev/null && pwd)"

 # input/output file folder (absolute, so later cd calls do not matter)
 iofolder="$(dirname -- "$(realpath -- "$1")")"
 iofile="$iofolder/$(basename -- "$1")"

 # From here on everything printed goes to two log files next to the input:
 # stdout to <file>.log-1.txt, stderr (including the set -x trace) to
 # <file>.log-2.txt.
 exec >  "$iofile".log-1.txt
 exec 2> "$iofile".log-2.txt

 # Table of all images embedded in the pdf; every check below reads it.
 pdfImagesList="$(pdfimages -list "$iofile")"

 # Image count = number of lines in the listing minus two (presumably the
 # two header lines pdfimages prints -- confirm against your version).
 listingLineCount=$(wc -l <<<"$pdfImagesList")
 numberOfImages=$(( listingLineCount - 2 ))
 # Page count as reported by pdftk's metadata dump.
 numberOfPages=$(pdftk "$iofile" dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')
 if [ "$numberOfImages" -ne "$numberOfPages" ]; then
   echo "File $iofile is not a standard scanned pdf with one image file per page. Skipping."
   exit 0
 fi

 # Lines mentioning "jpeg" are counted as jpeg pages; all others are treated
 # as text pages. grep -c exits non-zero when it counts 0, hence "|| true".
 numberOfJpegPages=$(grep -c -e 'jpeg' <<<"$pdfImagesList" || true)
 numberOfTextPages=$(( numberOfImages - numberOfJpegPages ))
 if [ "$numberOfTextPages" -eq 0 ]; then
   echo "File consists of just jpegs, can't do much. Skipping."
   exit 0
 fi

 # A grep exit status of 1 only means "no match" and is tolerated; any other
 # failure still aborts the script through set -e.
 jbig2presence="$(grep -oe 'jbig2' <<<"$pdfImagesList" || [[ $? == 1 ]])"
 if [[ -n "$jbig2presence" ]]; then
   echo "Input already contains jbig2 encoded images, it was probably already processed. Skipping."
   exit 0
 fi

 # put all temporary files into this folder
 workdir="$iofolder/tmp-$(basename "$1")"
 mkdir "$workdir"

 # extract original images: jpegs are written as .jpg (-j), everything else
 # as .tif (-tiff), named page-<index>.<ext>
 SECONDS=0
 mkdir "$workdir"/original
 pdfimages -tiff -j "$iofile" "$workdir"/original/page
 cd "$workdir"/original
 # Zero-pad 3- and 4-digit page indices to 5 digits so that glob order equals
 # page order. Done in plain bash instead of calling "rename", whose
 # perl-expression syntax is not understood by the util-linux rename that
 # some distributions install under the same name.
 for f in page-*.???; do
   pageIndex="${f#page-}"
   pageIndex="${pageIndex%.???}"
   if [[ "$pageIndex" =~ ^[0-9]{3,4}$ ]]; then
     # 10# forces base 10; a leading zero would otherwise mean octal
     printf -v paddedName 'page-%05d.%s' "$((10#$pageIndex))" "${f##*.}"
     mv -- "$f" "$paddedName"
   fi
 done
 echo "Extracted individual pages from pdf in $SECONDS seconds." 

 # remove noise if wanted
 if test $# -eq 2; then
   if test "$2" != "denoise"; then
     # Say why we stop and remove the temporary folder again; a leftover
     # folder would make the mkdir of the next run fail.
     echo "Unknown option '$2'. How to use: $0 file.pdf [denoise]" >&2
     cd "$iofolder"
     rm -r "$workdir"
     exit 1
   fi
   mkdir "$workdir"/denoised
   cd "$workdir"/original

   # Crop the jpeg files -- comment out if not needed
   SECONDS=0
   for f in *.jpg; do
     # Former approach, blurring at full size (very slow):
     #area="$(convert "$f" -virtual-pixel edge -blur 0x100 -fuzz 5% -trim -format '%wx%h%O' info:)"
     # Improvement: blur a 1% thumbnail instead ...
     originalDimensions=$(identify -format "%[fx:w]x%[fx:h]" "$f")
     convert "$f" -resize 1% -virtual-pixel edge -blur 0x2 "$f".small.jpg
     # ... and scale it back up to the exact original dimensions (not by a
     # factor, which would introduce unacceptable rounding errors) before
     # asking -trim for the crop geometry.
     area="$(convert "$f".small.jpg -resize "$originalDimensions" -fuzz 2% -trim -format '%wx%h%O' info:)"
     rm "$f".small.jpg
     # jpegtran crops without re-encoding the jpeg
     jpegtran -crop "$area" "$f" > ../denoised/"$f"
   done
   echo "Cropped all extracted jpegs in $SECONDS seconds."

   SECONDS=0
   for f in *.tif; do
     outfile=../denoised/"${f%.tif}.png"
     # convert's exit status is ignored on purpose; its messages decide below
     messages="$(convert "$f" -morphology Close Diamond:1 -define connected-components:area-threshold=30 -define connected-components:mean-color=true -connected-components 4 "$outfile" 2>&1 || true)"

     if grep -qe 'too many objects' <<<"$messages"; then
       echo "Non-text detected and left untouched: $f (zero-indexed)."
       convert "$f" "$outfile"
     elif test ! -s "$outfile"; then
       # Any other failure would otherwise silently drop the page from the
       # result; fall back to a plain conversion (which aborts on error).
       echo "Despeckling failed, page left untouched: $f (zero-indexed). Messages: $messages"
       convert "$f" "$outfile"
     fi
   done
   echo "Despeckled all text pages in $SECONDS seconds."
   cd "$workdir"
   ln -s denoised/ assemble-input
 else
   cd "$workdir"/original
   SECONDS=0
   # one conversion job per tif, run in parallel through GNU parallel's sem
   for f in *.tif; do
     sem -j +0 convert "$f" "${f%.tif}.png"
   done
   sem --wait
   echo "Converted all text pages from tiff to png in $SECONDS seconds."
   cd "$workdir"
   ln -s original/ assemble-input
 fi

 # Give every jpeg page a single-page pdf of the same base name, stored
 # right next to it in the assemble-input folder.
 SECONDS=0
 cd "$workdir/assemble-input"
 for jpegPage in *.jpg; do
   wrappedPage="$workdir/assemble-input/${jpegPage%.jpg}.pdf"
   img2pdf --nodate "$jpegPage" --output "$wrappedPage"
 done

 # Fill the global array catArgs with the pdftk "cat" arguments that put the
 # single-page pdf with handle I at 1-indexed position $1 of the final
 # document, given that the pdf with handle A currently has $2 pages.
 setJpegCatArgs() {
   local position=$1 pagesInA=$2
   if test "$position" -le 1; then
     catArgs=(I A)            # jpeg becomes the first page
   elif test "$position" -gt "$pagesInA"; then
     catArgs=(A I)            # jpeg is appended at the end
   else
     # pages 1..position-1 of A, then the jpeg, then the rest of A
     catArgs=("A1-$((position - 1))" I "A${position}-end")
   fi
 }

 if test "$numberOfTextPages" -gt 0; then
   # compress text
   cd "$workdir"
   mkdir jbig2-output
   cd jbig2-output

   SECONDS=0
   "$bin"/jbig2 -s -p ../assemble-input/*.png 2>/dev/null
   echo "Compressed text pages using jbig2 in $SECONDS seconds."

   SECONDS=0
   "$bin"/pdf.py output > jbig2.pdf
   echo "Wrapped jbig2 images in pdf in $SECONDS seconds."

   # insert pdf-wrapped jpegs in right place in jbig2-pdf
   SECONDS=0
   cd "$workdir"/assemble-input
   cp ../jbig2-output/jbig2.pdf assembly.pdf
   # The glob yields the jpegs in ascending page order, so whenever a jpeg is
   # inserted all pages that belong before it are already in assembly.pdf.
   for f in ../original/*.jpg; do
     # copy file because I'm not sure if there will be a problem if input and output file are equal
     cp assembly.pdf tmp.pdf

     # get pdf-file-name
     pdfFilename="$(basename "$f" .jpg).pdf"

     # get 1-indexed page number from the zero-indexed file name
     # (10# forces base 10; the leading zeros would otherwise mean octal)
     pageNumber="${pdfFilename%.pdf}"
     pageNumber="$(( 10#${pageNumber#page-} + 1 ))"

     # get number of pages concatenated so far
     pagesSoFar=$(pdftk tmp.pdf dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')

     # insert pdf at correct place
     setJpegCatArgs "$pageNumber" "$pagesSoFar"
     pdftk A=tmp.pdf I="$pdfFilename" cat "${catArgs[@]}" output assembly.pdf
   done
   echo "Put jpeg pages into pdf in $SECONDS seconds."
 else
   cd "$workdir"/assemble-input
   pdftk *.pdf cat output assembly.pdf
 fi

 # Never replace the input with an empty result; the temporary folder is
 # kept in that case so the failure can be inspected.
 if test ! -s assembly.pdf; then
   echo "Assembled pdf is empty, leaving $iofile untouched." >&2
   exit 1
 fi

 # Sizes in bytes. du would report allocated disk blocks, which is too coarse
 # for small files and can even be 0. The $(( )) strips the padding some wc
 # implementations print.
 compressedSize=$(( $(wc -c < assembly.pdf) ))
 originalSize=$(( $(wc -c < "$iofile") ))
 # overwrite the input file with the result and clean up
 cp assembly.pdf "$iofile"
 rm -r "$workdir"
 echo "Successfully compressed pdf with ratio $(echo "scale=2; $originalSize/$compressedSize" | bc)."
	#!/bin/bash
	#######################################################################
	# Copyright Lukas Bürgi 2020
	#
	# This file is free software: you can redistribute it and/or modify it
	# under the terms of the GNU Affero General Public License as published
	# by the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This file is distributed in the hope that it will be useful, but WITHOUT
	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
	# License for more details.
	#
	# You should have received a copy of the GNU Affero General Public
	# License along with this file. If not, see
	# <https://www.gnu.org/licenses/>.
	########################################################################
	# How to use: ./compress file.pdf [denoise]
	# If you skip the denoise, it should be lossless, but will be extremely
	# slow and result in somewhat bigger files.
	# With denoise enabled, jpegs will be cropped to remove borders
	# containing only noise (leaving a considerable safety margin) and on
	# all other images some measures to remove black speckles will be taken
	# (which should also be pretty safe, but only if the images are high
	# quality and contain text, everything else could be mangled beyond use).
	#
	# Caveats:
	# There are some fixed assumptions which will cause problems if untrue:
	# * Each pdf page consists of a single image
	# * jpeg pages are covers and the like and can't be compressed further
	# * All pages in other formats are black and white text
	# There is also a possible bug with jbig2enc which leads to a fixed
	# pixel density metadata setting of 72dpi for text pages. To fix this
	# you need to modify pdf.py around line 150 to replace
	# if xres == 0:
	# xres = dpi
	# if yres == 0:
	# yres = dpi
	# with
	# xres = dpi
	# yres = dpi
	# where dpi is a variable that is configured at the beginning of the file
	# and needs to be set to the dpi of your scan, for example 600.
	#
	# Prerequisites:
	# Put the files jbig2 and pdf.py from
	# https://github.com/agl/jbig2enc into the same directory as this script
	# and install
	# pdftk imagemagick time pdfimages parallel jpegtran img2pdf bc
	# so that they are on the path (in contrast to the jbig2 utilities, they
	# should be included in most distributions so that they are assumed to be
	# installed). This file won't work if called from path, i.e. don't try to
	# install it.
	########################################################################

	# make bash behave more like a proper language
	set -euo pipefail
	shopt -s nullglob

	# make debugging easier
	set -x

	# Without a file argument "$1" would only trigger a bare "unbound variable"
	# error from set -u; print a usage hint instead (stderr is still the
	# terminal at this point).
	if test $# -lt 1; then
	  echo "How to use: $0 file.pdf [denoise]" >&2
	  exit 1
	fi

	# bin folder: the directory this script lives in, as an absolute path.
	# Resolved via cd/pwd so that it is correct for both relative and absolute
	# values of $0 (gluing $(pwd) in front of an absolute $0 yields a path that
	# does not exist). cd's output is discarded in case CDPATH makes it print.
	bin="$(cd -- "$(dirname -- "$0")" >/dev/null && pwd)"

	# input/output file folder (absolute, so later cd calls do not matter)
	iofolder="$(dirname -- "$(realpath -- "$1")")"
	iofile="$iofolder/$(basename -- "$1")"

	# From here on everything printed goes to two log files next to the input:
	# stdout to <file>.log-1.txt, stderr (including the set -x trace) to
	# <file>.log-2.txt.
	exec > "$iofile".log-1.txt
	exec 2> "$iofile".log-2.txt

	# Table of all images embedded in the pdf; every check below reads it.
	pdfImagesList="$(pdfimages -list "$iofile")"

	# Image count = number of lines in the listing minus two (presumably the
	# two header lines pdfimages prints -- confirm against your version).
	# Real pipes / here-strings are used throughout: a backslash-escaped "\|"
	# is just a literal argument and never connects two commands.
	listingLineCount=$(wc -l <<<"$pdfImagesList")
	numberOfImages=$(( listingLineCount - 2 ))
	# Page count as reported by pdftk's metadata dump.
	numberOfPages=$(pdftk "$iofile" dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')
	if [ "$numberOfImages" -ne "$numberOfPages" ]; then
	  echo "File $iofile is not a standard scanned pdf with one image file per page. Skipping."
	  exit 0
	fi

	# Lines mentioning "jpeg" are counted as jpeg pages; all others are treated
	# as text pages. grep -c exits non-zero when it counts 0, hence "|| true".
	numberOfJpegPages=$(grep -c -e 'jpeg' <<<"$pdfImagesList" || true)
	numberOfTextPages=$(( numberOfImages - numberOfJpegPages ))
	if [ "$numberOfTextPages" -eq 0 ]; then
	  echo "File consists of just jpegs, can't do much. Skipping."
	  exit 0
	fi

	# A grep exit status of 1 only means "no match" and is tolerated; any other
	# failure still aborts the script through set -e.
	jbig2presence="$(grep -oe 'jbig2' <<<"$pdfImagesList" || [[ $? == 1 ]])"
	if [[ -n "$jbig2presence" ]]; then
	  echo "Input already contains jbig2 encoded images, it was probably already processed. Skipping."
	  exit 0
	fi

	# put all temporary files into this folder
	workdir="$iofolder/tmp-$(basename "$1")"
	mkdir "$workdir"

	# extract original images: jpegs are written as .jpg (-j), everything else
	# as .tif (-tiff), named page-<index>.<ext>
	SECONDS=0
	mkdir "$workdir"/original
	pdfimages -tiff -j "$iofile" "$workdir"/original/page
	cd "$workdir"/original
	# Zero-pad 3- and 4-digit page indices to 5 digits so that glob order equals
	# page order. Done in plain bash instead of calling "rename", whose
	# perl-expression syntax is not understood by the util-linux rename that
	# some distributions install under the same name.
	for f in page-*.???; do
	  pageIndex="${f#page-}"
	  pageIndex="${pageIndex%.???}"
	  if [[ "$pageIndex" =~ ^[0-9]{3,4}$ ]]; then
	    # 10# forces base 10; a leading zero would otherwise mean octal
	    printf -v paddedName 'page-%05d.%s' "$((10#$pageIndex))" "${f##*.}"
	    mv -- "$f" "$paddedName"
	  fi
	done
	echo "Extracted individual pages from pdf in $SECONDS seconds."

	# remove noise if wanted
	if test $# -eq 2; then
	  if test "$2" != "denoise"; then
	    # Say why we stop and remove the temporary folder again; a leftover
	    # folder would make the mkdir of the next run fail.
	    echo "Unknown option '$2'. How to use: $0 file.pdf [denoise]" >&2
	    cd "$iofolder"
	    rm -r "$workdir"
	    exit 1
	  fi
	  mkdir "$workdir"/denoised
	  cd "$workdir"/original

	  # Crop the jpeg files -- comment out if not needed
	  SECONDS=0
	  for f in *.jpg; do
	    # Former approach, blurring at full size (very slow):
	    #area="$(convert "$f" -virtual-pixel edge -blur 0x100 -fuzz 5% -trim -format '%wx%h%O' info:)"
	    # Improvement: blur a 1% thumbnail instead ...
	    originalDimensions=$(identify -format "%[fx:w]x%[fx:h]" "$f")
	    convert "$f" -resize 1% -virtual-pixel edge -blur 0x2 "$f".small.jpg
	    # ... and scale it back up to the exact original dimensions (not by a
	    # factor, which would introduce unacceptable rounding errors) before
	    # asking -trim for the crop geometry.
	    area="$(convert "$f".small.jpg -resize "$originalDimensions" -fuzz 2% -trim -format '%wx%h%O' info:)"
	    rm "$f".small.jpg
	    # jpegtran crops without re-encoding the jpeg
	    jpegtran -crop "$area" "$f" > ../denoised/"$f"
	  done
	  echo "Cropped all extracted jpegs in $SECONDS seconds."

	  SECONDS=0
	  for f in *.tif; do
	    outfile=../denoised/"${f%.tif}.png"
	    # convert's exit status is ignored on purpose; its messages decide below.
	    # Note the real "||": an escaped "\|\|" would be handed to convert as
	    # literal arguments.
	    messages="$(convert "$f" -morphology Close Diamond:1 -define connected-components:area-threshold=30 -define connected-components:mean-color=true -connected-components 4 "$outfile" 2>&1 || true)"

	    if grep -qe 'too many objects' <<<"$messages"; then
	      echo "Non-text detected and left untouched: $f (zero-indexed)."
	      convert "$f" "$outfile"
	    elif test ! -s "$outfile"; then
	      # Any other failure would otherwise silently drop the page from the
	      # result; fall back to a plain conversion (which aborts on error).
	      echo "Despeckling failed, page left untouched: $f (zero-indexed). Messages: $messages"
	      convert "$f" "$outfile"
	    fi
	  done
	  echo "Despeckled all text pages in $SECONDS seconds."
	  cd "$workdir"
	  ln -s denoised/ assemble-input
	else
	  cd "$workdir"/original
	  SECONDS=0
	  # one conversion job per tif, run in parallel through GNU parallel's sem
	  for f in *.tif; do
	    sem -j +0 convert "$f" "${f%.tif}.png"
	  done
	  sem --wait
	  echo "Converted all text pages from tiff to png in $SECONDS seconds."
	  cd "$workdir"
	  ln -s original/ assemble-input
	fi

	# Give every jpeg page a single-page pdf of the same base name, stored
	# right next to it in the assemble-input folder.
	SECONDS=0
	cd "$workdir/assemble-input"
	for jpegPage in *.jpg; do
	  wrappedPage="$workdir/assemble-input/${jpegPage%.jpg}.pdf"
	  img2pdf --nodate "$jpegPage" --output "$wrappedPage"
	done

	# Fill the global array catArgs with the pdftk "cat" arguments that put the
	# single-page pdf with handle I at 1-indexed position $1 of the final
	# document, given that the pdf with handle A currently has $2 pages.
	setJpegCatArgs() {
	  local position=$1 pagesInA=$2
	  if test "$position" -le 1; then
	    catArgs=(I A)            # jpeg becomes the first page
	  elif test "$position" -gt "$pagesInA"; then
	    catArgs=(A I)            # jpeg is appended at the end
	  else
	    # pages 1..position-1 of A, then the jpeg, then the rest of A
	    catArgs=("A1-$((position - 1))" I "A${position}-end")
	  fi
	}

	if test "$numberOfTextPages" -gt 0; then
	  # compress text
	  cd "$workdir"
	  mkdir jbig2-output
	  cd jbig2-output

	  SECONDS=0
	  "$bin"/jbig2 -s -p ../assemble-input/*.png 2>/dev/null
	  echo "Compressed text pages using jbig2 in $SECONDS seconds."

	  SECONDS=0
	  "$bin"/pdf.py output > jbig2.pdf
	  echo "Wrapped jbig2 images in pdf in $SECONDS seconds."

	  # insert pdf-wrapped jpegs in right place in jbig2-pdf
	  SECONDS=0
	  cd "$workdir"/assemble-input
	  cp ../jbig2-output/jbig2.pdf assembly.pdf
	  # The glob yields the jpegs in ascending page order, so whenever a jpeg is
	  # inserted all pages that belong before it are already in assembly.pdf.
	  for f in ../original/*.jpg; do
	    # copy file because I'm not sure if there will be a problem if input and output file are equal
	    cp assembly.pdf tmp.pdf

	    # get pdf-file-name
	    pdfFilename="$(basename "$f" .jpg).pdf"

	    # get 1-indexed page number from the zero-indexed file name
	    # (10# forces base 10; the leading zeros would otherwise mean octal)
	    pageNumber="${pdfFilename%.pdf}"
	    pageNumber="$(( 10#${pageNumber#page-} + 1 ))"

	    # get number of pages concatenated so far (real pipes; an escaped "\|"
	    # would be passed to pdftk as a literal argument)
	    pagesSoFar=$(pdftk tmp.pdf dump_data | grep -Fe 'NumberOfPages' | grep -oe '[0-9]*')

	    # insert pdf at correct place
	    setJpegCatArgs "$pageNumber" "$pagesSoFar"
	    pdftk A=tmp.pdf I="$pdfFilename" cat "${catArgs[@]}" output assembly.pdf
	  done
	  echo "Put jpeg pages into pdf in $SECONDS seconds."
	else
	  cd "$workdir"/assemble-input
	  pdftk *.pdf cat output assembly.pdf
	fi

	# Never replace the input with an empty result; the temporary folder is
	# kept in that case so the failure can be inspected.
	if test ! -s assembly.pdf; then
	  echo "Assembled pdf is empty, leaving $iofile untouched." >&2
	  exit 1
	fi

	# Sizes in bytes. du would report allocated disk blocks, which is too coarse
	# for small files and can even be 0. The $(( )) strips the padding some wc
	# implementations print.
	compressedSize=$(( $(wc -c < assembly.pdf) ))
	originalSize=$(( $(wc -c < "$iofile") ))
	# overwrite the input file with the result and clean up
	cp assembly.pdf "$iofile"
	rm -r "$workdir"
	# real pipe into bc; an escaped "\|" would only be echoed as text
	echo "Successfully compressed pdf with ratio $(echo "scale=2; $originalSize/$compressedSize" | bc)."