Last active
May 16, 2019 02:23
-
-
Save kbauer/0b80df2e402ae40f68ff to your computer and use it in GitHub Desktop.
Converts a PDF obtained from scanning into a fax-like monochrome PDF. Assumes that the PDF consists of only a sequence of page-filling images. Requires my other gist imagemagick-scan-to-mono.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# -*- mode: sh; coding: us-ascii-unix -*-
#
# Converts a scanned PDF (one page-filling image per page) into a
# monochrome PDF.  External tools invoked further down in this script:
# pdfimages, convert (ImageMagick), pdftk and imagemagick-scan-to-mono.sh.

# libstacktrace is optional: when it cannot be sourced the script simply
# carries on without it (hence the deliberate "|| true").
source libstacktrace || true
# set -e -u -E

# Help text printed for -h/--help and when the script is called without
# arguments.  $0 is expanded here, at assignment time.
MANUAL="
Usage: $0 [options] INPUT OUTPUT
$0 --inplace [options] INPUT
Converts a scan-pdf (assuming one image per page) to monochrome.
When OUTPUT is not given, replace the input file instead and create a
backup file.
-f INT, --from-page INT
Process only pages with page number >= INT
-t INT, --to-page INT
Process only pages with page number <= INT
-P, --parallel INT
Process INT pages in parallel each.
-v, --verbose / +v, --no-verbose
Enables/Disables verbose reporting.
-i, --inplace
Instead of producing OUTPUT, change INPUT inplace and create a
backup file INPUT.bakNNN.pdf (NNN is the first unused number).
-d,--density FLOAT
Instead of using 'pdfimages' for image conversion, use imagemagick
with the given number passed to the '-density' flag.
Meant for pdfs that do NOT consist of one image per page and
nothing else. For files that do, this setting will cause a loss of
quality!
-h, -?, --help
Prints this message
"
#######################################
# Print a message, but only in verbose mode.
# Globals:   VERBOSE (read) - "true" enables output; anything else,
#            including unset, keeps the function silent.
# Arguments: words of the message; joined with single spaces.
# Outputs:   the message on stdout when verbose.
# Returns:   always 0, so a failed write never aborts the caller.
#######################################
vecho() {
  if [[ ${VERBOSE:-false} == true ]]; then
    printf '%s\n' "$*" || true
  fi
  return 0
}
#######################################
# Move files with mv, adding -v when verbose mode is on.
# Globals:   VERBOSE (read) - "true" selects "mv -v".
# Arguments: passed unchanged to mv.
# Returns:   the exit status of the single mv invocation.
#
# An explicit if/else is used instead of "a && b || c" so that a failing
# "mv -v" is not followed by a second, non-verbose mv attempt.
#######################################
vmv() {
  if [[ ${VERBOSE:-false} == true ]]; then
    mv -v "$@"
  else
    mv "$@"
  fi
}
######### COMMAND LINE PARSING #######################################
# Defaults for everything the option parser may set.
declare VERBOSE=false        # true -> progress messages are printed
declare -a ARGS=()           # positional arguments (INPUT, OUTPUT)
declare PAGE_LIMIT_LOW=""    # "-f N" option fragment for pdfimages, or empty
declare PAGE_LIMIT_HIGH=""   # "-l N" option fragment for pdfimages, or empty
declare PARALLEL=1           # number of pages handed to xargs -P
declare INPLACE=false        # true -> replace INPUT and keep a backup
declare PDFIMAGES=true       # false -> extract with "convert -density"
declare DENSITY=""           # value for convert's -density (set by -d)

## Print manual
# Called without any argument: show the usage text as a diagnostic and
# signal misuse through the exit status.
if [[ $# -eq 0 ]]; then
  echo "$MANUAL" >&2
  exit 1
fi
## Getopt-style consumption of arguments ##
##
## Don't forget "shift", don't delete "--" and "*" cases.

# Abort when an option that needs a value is the last word on the
# command line.  Without this check "shift 2" fails, shifts nothing and
# the parsing loop never terminates.
# Arguments: $1 - option name, $2 - number of words left (option included)
_require_value() {
  if (( $2 < 2 )); then
    printf '%s: option %s requires an argument\n' "$0" "$1" >&2
    exit 1
  fi
}

# Abort unless the value given to an option is a non-negative integer.
# Arguments: $1 - option name, $2 - value to check
_require_int() {
  if [[ ! $2 =~ ^[0-9]+$ ]]; then
    printf '%s: option %s expects an integer, got "%s"\n' "$0" "$1" "$2" >&2
    exit 1
  fi
}

#######################################
# Parse the command line into the option variables.
# Globals:   VERBOSE, PAGE_LIMIT_LOW, PAGE_LIMIT_HIGH, PARALLEL, INPLACE,
#            DENSITY, PDFIMAGES, ARGS (written); MANUAL (read)
# Arguments: the script's command line words
# Outputs:   the manual on stdout for -h/--help
# Returns:   0; exits 0 after printing help, exits 1 on a malformed option
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|-\?|--help)
        echo "$MANUAL"
        exit 0 ;;
      -v|--verbose)
        VERBOSE=true
        shift ;;
      # --noverbose is the spelling the manual used to document; accept
      # it alongside --no-verbose.
      +v|--no-verbose|--noverbose)
        VERBOSE=false
        shift ;;
      -f|--from-page)
        _require_value "$1" "$#"
        _require_int "$1" "$2"
        PAGE_LIMIT_LOW="-f $2"
        shift 2 ;;
      -t|--to-page)
        _require_value "$1" "$#"
        _require_int "$1" "$2"
        PAGE_LIMIT_HIGH="-l $2"
        shift 2 ;;
      -P|--parallel)
        _require_value "$1" "$#"
        _require_int "$1" "$2"
        PARALLEL=$2
        shift 2 ;;
      -i|--inplace)
        INPLACE=true
        shift ;;
      -d|--density)
        _require_value "$1" "$#"
        DENSITY=$2
        PDFIMAGES=false
        shift 2 ;;
      --)
        shift
        break ;;
      *)
        ARGS+=("$1")
        shift ;;
    esac
  done
  ## Consume stuff remaining after -- ##
  ARGS+=("$@")
}

parse_args "$@"
## Note that ${ARGS[@]} is considered unbound if it is empty!
# Resolve INPUT and OUTPUT to absolute paths; the script later changes
# into a scratch directory, so relative paths would stop working.
if (( ${#ARGS[@]} < 1 )) || [[ -z ${ARGS[0]} ]]; then
  printf '%s: missing INPUT file\n' "$0" >&2
  exit 1
fi
INFILE=$(readlink -m -- "${ARGS[0]}")
if [[ ! -f $INFILE ]]; then
  printf '%s: input file not found: %s\n' "$0" "$INFILE" >&2
  exit 1
fi

# The manual promises: "When OUTPUT is not given, replace the input file
# instead and create a backup file."
if [[ ${INPLACE:-false} != true && -z ${ARGS[1]:-} ]]; then
  INPLACE=true
fi

if [[ $INPLACE == true ]]; then
  # In-place mode writes to a temporary file first; it is swapped with
  # INPUT at the very end.  The template is derived from the input's
  # base name ("$INFILE", not the literal word INFILE).
  OUTFILE=$(mktemp -t "$(basename -- "$INFILE" .pdf).XXXXXXXXX.pdf") || {
    printf '%s: cannot create temporary output file\n' "$0" >&2
    exit 1
  }
else
  OUTFILE=$(readlink -m -- "${ARGS[1]}")
fi
# Scratch directory holding the extracted page images.
# NOTE(review): this reuses the conventional TMPDIR variable name; if
# TMPDIR is exported in the caller's environment, child processes will
# see the scratch directory as their temp location -- confirm intended.
TMPDIR=$(mktemp -d) || {
  printf '%s: cannot create work directory\n' "$0" >&2
  exit 1
}
vecho "Using work directory '$TMPDIR'."
# Everything below creates and deletes page-* files in the current
# directory, so never continue if the cd did not happen.
cd "$TMPDIR" || {
  printf '%s: cannot enter work directory %s\n' "$0" "$TMPDIR" >&2
  exit 1
}
vecho "Extracting images from '$INFILE'..."
## Cannot be parallelized, file-locking issue.
# The command is built as an array and run directly; no eval, so option
# values are never re-interpreted as shell code.
if [[ ${PDFIMAGES:-true} == true ]]; then
  # PAGE_LIMIT_LOW/HIGH hold "-f N" / "-l N" (or nothing); split them
  # into words without globbing.
  page_limits=()
  read -r -a page_limits <<<"$PAGE_LIMIT_LOW $PAGE_LIMIT_HIGH"
  extract_cmd=(pdfimages -j "${page_limits[@]}" "$INFILE" page)
else
  # The page limits are not applied on this code path.
  extract_cmd=(convert -density "$DENSITY" "$INFILE" page-%03d.png)
fi
# vecho "${extract_cmd[*]}"
# Extraction stays best-effort (the script continues), but a failure is
# reported instead of being swallowed silently.
"${extract_cmd[@]}" \
  || printf '%s: warning: image extraction reported an error\n' "$0" >&2
# Convert every extracted page image to a one-page monochrome PDF,
# PARALLEL pages at a time.  The file name and the verbose flag are
# handed to the inner shell as positional parameters, so they are data
# and never part of the shell code.  Names are NUL-delimited.
# The source image and the intermediate PNG are always removed; a failed
# conversion makes the inner shell exit non-zero, which xargs reports
# and which aborts the script instead of silently dropping the page.
find . -name 'page-*' ! -name 'page-*-mono*' -print0 \
  | xargs -0 -P "${PARALLEL:-1}" -I '{}' sh -c '
      src=$1
      verbose=$2
      status=0
      if imagemagick-scan-to-mono.sh "$src" "${src}-mono.png" \
          && convert "${src}-mono.png" -flatten "${src}-mono.pdf"; then
        if [ "$verbose" = true ]; then echo "Finished file $src"; fi
      else
        status=1
        echo "Failed to convert $src" >&2
      fi
      rm -f -- "$src" "${src}-mono.png"
      exit "$status"
    ' sh '{}' "${VERBOSE:-false}" \
  || {
    printf '%s: converting one or more pages failed; work directory: %s\n' \
      "$0" "$PWD" >&2
    exit 1
  }
vecho "Assembling PDF file '$OUTFILE'..."
# Collect the per-page PDFs; the glob sorts them by name.  If nothing
# matched, the array holds the literal pattern, which does not exist.
mono_pages=(page-*-mono.pdf)
if [[ ! -e ${mono_pages[0]} ]]; then
  printf '%s: no converted pages found in %s\n' "$0" "$PWD" >&2
  exit 1
fi
# Stop on failure: continuing would leave OUTFILE missing or empty, and
# in-place mode would then move that over the input file.
pdftk "${mono_pages[@]}" cat output out.pdf || {
  printf '%s: pdftk failed to assemble the pages\n' "$0" >&2
  exit 1
}
mv -- out.pdf "$OUTFILE" || {
  printf '%s: cannot write %s\n' "$0" "$OUTFILE" >&2
  exit 1
}
rm -- "${mono_pages[@]}"
# If the work directory is not empty, list what was left behind.
rmdir "$TMPDIR" || ls -l >&2
# In-place mode: keep the original as INPUT.bakNNN.pdf (first unused
# NNN), then move the freshly built file into INPUT's place.
if [[ ${INPLACE:-false} == true ]]; then
  i=0
  printf -v backup '%s.bak%03d.pdf' "$INFILE" "$i"
  while [[ -e $backup ]]; do
    i=$((i + 1))
    printf -v backup '%s.bak%03d.pdf' "$INFILE" "$i"
  done
  # Without a backup the original must not be overwritten.
  vmv "$INFILE" "$backup" || {
    printf '%s: cannot create backup %s; result left in %s\n' \
      "$0" "$backup" "$OUTFILE" >&2
    exit 1
  }
  vmv "$OUTFILE" "$INFILE" || {
    printf '%s: cannot replace %s; original is in %s, result in %s\n' \
      "$0" "$INFILE" "$backup" "$OUTFILE" >&2
    exit 1
  }
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment