Skip to content

Instantly share code, notes, and snippets.

@kbauer
Last active May 16, 2019 02:23
Show Gist options
  • Save kbauer/0b80df2e402ae40f68ff to your computer and use it in GitHub Desktop.
Save kbauer/0b80df2e402ae40f68ff to your computer and use it in GitHub Desktop.
Converts a PDF obtained from scanning into a fax-like monochrome PDF. Assumes that the PDF consists of only a sequence of page-filling images. Requires my other gist imagemagick-scan-to-mono.sh
#!/usr/bin/env bash
# -*- mode: sh; coding: us-ascii-unix -*-
# Optional helper: a failure to load libstacktrace is tolerated (|| true).
source libstacktrace || true
# set -e -u -E
# Help text, printed for -h/--help and when the script is called without
# arguments. Keep it in sync with the option parser: the option is spelled
# --no-verbose, --inplace is required to omit OUTPUT, and backups are named
# INPUT.bakNNN.pdf.
MANUAL="
Usage: $0 [options] INPUT OUTPUT
$0 --inplace [options] INPUT
Converts a scan-pdf (assuming one image per page) to monochrome.
With --inplace, OUTPUT is omitted: the input file is replaced and a
backup file is created.
-f INT, --from-page INT
Process only pages with page number >= INT
-t INT, --to-page INT
Process only pages with page number <= INT
-P, --parallel INT
Process INT pages in parallel each.
-v, --verbose / +v, --no-verbose
Enables/Disables verbose reporting.
-i, --inplace
Instead of producing OUTPUT, change INPUT inplace and create a
backup file INPUT.bakNNN.pdf (NNN is the first unused number).
-d,--density FLOAT
Instead of using 'pdfimages' for image conversion, use imagemagick
with the given number passed to the '-density' flag.
Meant for pdfs that do NOT consist of one image per page and
nothing else. For files that do, this setting will cause a loss of
quality!
-h, -?, --help
Prints this message
"
# Echoes its arguments only while VERBOSE is true; always returns 0 so it
# can be used anywhere without affecting the caller's status.
vecho() {
  if $VERBOSE; then
    echo "$@"
  fi
  return 0
}
# Moves files like mv(1); reports each rename (mv -v) while VERBOSE is true.
# Arguments: passed through to mv unchanged.
# Returns:   the exit status of the single mv invocation.
# An explicit if/else is used because `A && B || C` would run the plain mv
# a second time whenever `mv -v` fails, repeating the move and the error.
vmv() {
  if $VERBOSE; then
    mv -v "$@"
  else
    mv "$@"
  fi
}
######### COMMAND LINE PARSING #######################################
## Defaults for everything the option parser may override.
VERBOSE=false        # progress messages off unless -v/--verbose is given
ARGS=()              # positional arguments (INPUT and, optionally, OUTPUT)
PAGE_LIMIT_LOW=""    # becomes "-f N" for pdfimages when --from-page is given
PAGE_LIMIT_HIGH=""   # becomes "-l N" for pdfimages when --to-page is given
PARALLEL=1           # number of pages handed to xargs -P at once
INPLACE=false        # true => replace INPUT and keep a backup
PDFIMAGES=true       # false => extract pages with 'convert -density' instead
## Print manual
## Called without a single argument: show the help text and report failure.
(( $# > 0 )) || {
  echo "$MANUAL"
  exit 1
}
## Getopt-style consumption of arguments ##
##
## Don't forget "shift", don't delete "--" and "*" cases.

# Aborts with an error when an option that takes a value is the last word.
# Arguments: $1 - the option as typed, $2 - number of words still unparsed.
# Without this guard `shift 2` fails, consumes nothing, and the parsing
# loop would spin forever on the same option.
_require_option_value() {
  if [[ $2 -lt 2 ]]; then
    echo "$0: option '$1' requires an argument" >&2
    exit 1
  fi
}

# Parses the given command line words.
# Globals written: VERBOSE, PAGE_LIMIT_LOW, PAGE_LIMIT_HIGH, PARALLEL,
#                  INPLACE, DENSITY, PDFIMAGES, ARGS (positional words are
#                  appended in order).
# Globals read:    MANUAL (printed for -h/-?/--help, then exit 0).
# Everything after a literal "--" is taken as positional, even if it looks
# like an option.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|-\?|--help)
        echo "$MANUAL"
        exit 0 ;;
      -v|--verbose)
        VERBOSE=true
        shift ;;
      ## --noverbose is the spelling the manual historically advertised;
      ## accept both.
      +v|--no-verbose|--noverbose)
        VERBOSE=false
        shift ;;
      -f|--from-page)
        _require_option_value "$1" "$#"
        PAGE_LIMIT_LOW="-f $2"
        shift 2 ;;
      -t|--to-page)
        _require_option_value "$1" "$#"
        ## pdfimages names its last-page option -l.
        PAGE_LIMIT_HIGH="-l $2"
        shift 2 ;;
      -P|--parallel)
        _require_option_value "$1" "$#"
        PARALLEL=$2
        shift 2 ;;
      -i|--inplace)
        INPLACE=true
        shift ;;
      -d|--density)
        _require_option_value "$1" "$#"
        DENSITY=$2
        PDFIMAGES=false
        shift 2 ;;
      --)
        shift
        break ;;
      *)
        ARGS+=("$1")
        shift ;;
    esac
  done
  ## Consume stuff remaining after -- ##
  ARGS+=("$@")
}

parse_args "$@"
## Note that ${ARGS[@]} is considered unbound if it is empty!
## The ${ARGS[n]:-} defaults keep the checks below safe even then.
if [[ -z ${ARGS[0]:-} ]]; then
  echo "$0: missing INPUT file (see --help)" >&2
  exit 1
fi
## Absolute paths are required because the script later changes into a
## scratch directory.
INFILE=$(readlink -m "${ARGS[0]}")
if [[ ! -f $INFILE ]]; then
  echo "$0: input file '$INFILE' does not exist" >&2
  exit 1
fi
if ! $INPLACE; then
  ## Fail before any work is done rather than when the result is moved.
  if [[ -z ${ARGS[1]:-} ]]; then
    echo "$0: missing OUTPUT file (use --inplace to replace INPUT)" >&2
    exit 1
  fi
  OUTFILE=$(readlink -m "${ARGS[1]}")
else
  ## In-place mode writes to a temporary file first; it replaces INPUT only
  ## at the very end. The template is derived from the input's base name
  ## (the original passed the literal word INFILE to basename).
  OUTFILE=$(mktemp -t "$(basename "$INFILE" .pdf).XXXXXXXXX.pdf") || {
    echo "$0: cannot create temporary output file" >&2
    exit 1
  }
fi
## Scratch directory for the extracted and converted pages. A dedicated
## variable is used instead of assigning to TMPDIR, which mktemp and many
## other tools consult for their own temporary files.
WORK_DIR=$(mktemp -d) || { echo "$0: cannot create work directory" >&2; exit 1; }
vecho "Using work directory '$WORK_DIR'."
## Files are deleted relative to the current directory further down, so
## never continue when the directory change failed.
cd "$WORK_DIR" || { echo "$0: cannot enter '$WORK_DIR'" >&2; exit 1; }
vecho "Extracting images from '$INFILE'..."
## Cannot be parallelized, file-locking issue.
## The command is kept as an array (no eval) so that file names are passed
## verbatim, whatever characters they contain.
if $PDFIMAGES; then
  ## PAGE_LIMIT_LOW/HIGH are either empty or "-f N" / "-l N"; they are left
  ## unquoted on purpose so each one expands to zero or two words.
  # shellcheck disable=SC2206
  extract_cmd=(pdfimages -j $PAGE_LIMIT_LOW $PAGE_LIMIT_HIGH "$INFILE" page)
else
  extract_cmd=(convert -density "$DENSITY" "$INFILE" page-%03d.png)
fi
# vecho "${extract_cmd[*]}"
## Extraction stays best-effort as before, but is no longer silent about it.
"${extract_cmd[@]}" \
  || echo "$0: warning: image extraction reported an error, continuing" >&2
## Convert every extracted page; outputs (*-mono*) are excluded so that a
## page finished by a parallel worker is not picked up as new input.
## The page name and the verbosity flag are handed to sh as arguments
## instead of being spliced into the script text.
find . -name "page-*" -and -not -name "page-*-mono*" -print0 \
  | xargs -0 -P "$PARALLEL" -I '{}' sh -c '
      if imagemagick-scan-to-mono.sh "$1" "$1-mono.png" \
          && convert "$1-mono.png" -flatten "$1-mono.pdf"; then
        if [ "$2" = true ]; then echo "Finished file $1"; fi
      fi
      rm -f -- "$1" "$1-mono.png"
    ' sh '{}' "$VERBOSE"
vecho "Assembling PDF file '$OUTFILE'..."
mono_pages=(page-*-mono.pdf)
## An unmatched glob stays literal; detect that instead of calling pdftk
## with a file that does not exist.
if [[ ! -e ${mono_pages[0]} ]]; then
  echo "$0: no page could be converted; nothing written to '$OUTFILE'" >&2
  exit 1
fi
## Stop on failure: with --inplace the following step would otherwise
## replace the input with an empty temporary file.
if ! pdftk "${mono_pages[@]}" cat output out.pdf \
    || ! mv out.pdf "$OUTFILE"; then
  echo "$0: assembling '$OUTFILE' failed; files kept in '$WORK_DIR'" >&2
  exit 1
fi
rm "${mono_pages[@]}"
## If anything unexpected is left behind, show it instead of deleting it.
rmdir "$WORK_DIR" || ls -l
## In-place mode: move the original aside to the first unused
## INPUT.bakNNN.pdf name, then move the converted file into its place.
if $INPLACE; then
  i=0
  backup=$(printf "%s.bak%03d.pdf" "$INFILE" "$i")
  while [[ -e $backup ]]; do
    i=$((i + 1))
    backup=$(printf "%s.bak%03d.pdf" "$INFILE" "$i")
  done
  ## Never overwrite the original unless the backup really exists.
  if ! vmv "$INFILE" "$backup"; then
    echo "$0: cannot create backup '$backup'; '$INFILE' left unchanged," \
      "result kept at '$OUTFILE'" >&2
    exit 1
  fi
  if ! vmv "$OUTFILE" "$INFILE"; then
    echo "$0: cannot replace '$INFILE'; original is at '$backup'," \
      "result kept at '$OUTFILE'" >&2
    exit 1
  fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment