Last active
September 4, 2024 18:52
-
-
Save brechtm/891de9f72516c1b2cbc1 to your computer and use it in GitHub Desktop.
Page-by-page PDF diff tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# usage: diffpdf.sh file_1.pdf file_2.pdf | |
# requirements: | |
# - ImageMagick | |
# - Poppler's pdftoppm and pdfinfo tools (works with 0.18.4 and 0.41.0, | |
# fails with 0.42.0) | |
# (could be replaced with Ghostscript if speed is | |
# not important - see commented commands below) | |
DIFFDIR="pdfdiff" # directory to place diff images in | |
MAXPROCS=$(getconf _NPROCESSORS_ONLN) # number of parallel processes | |
pdf_file1=$1 | |
pdf_file2=$2 | |
function diff_page { | |
# based on http://stackoverflow.com/a/33673440/438249 | |
pdf_file1=$1 | |
pdf_file2=$2 | |
page_number=$3 | |
page_index=$(($page_number - 1)) | |
# 2+x faster | |
(cat $pdf_file1 | pdftoppm -f $page_number -singlefile -gray - | convert - miff:- ; \ | |
cat $pdf_file2 | pdftoppm -f $page_number -singlefile -gray - | convert - miff:- ) | \ | |
convert - \( -clone 0-1 -compose darken -composite \) \ | |
-channel RGB -combine $DIFFDIR/$page_number.jpg | |
# 2x faster (breaks when using TIFF format instead of JPEG, and PNG is slow) | |
# (pdftocairo -f $page_number -singlefile -jpeg $pdf_file1 -gray - | convert - miff:- ; \ | |
# pdftocairo -f $page_number -singlefile -jpeg $pdf_file2 -gray - | convert - miff:- ) | \ | |
# convert - \( -clone 0-1 -compose darken -composite \) \ | |
# -channel RGB -combine $DIFFDIR/$page_number.jpg | |
# 1x (using Ghostscript for PDF to bitmap conversion) | |
# convert -respect-parenthesis \ | |
# \( $pdf_file1[$page_index] -flatten -colorspace gray \) \ | |
# \( $pdf_file2[$page_index] -flatten -colorspace gray \) \ | |
# \( -clone 0-1 -compose darken -composite \) \ | |
# -channel RGB -combine $DIFFDIR/$page_number.jpg | |
# compare $pdf_file1[$page_index] $pdf_file2[$page_index] \ | |
# -highlight-color blue $DIFFDIR/$page_number.png | |
if (($? > 0)); then | |
echo "Problem running pdftoppm or convert!" | |
exit 1 | |
fi | |
grayscale=$(convert $DIFFDIR/$page_number.jpg -colorspace HSL -channel g -separate +channel -format "%[fx:mean]" info:) | |
if [ "$grayscale" != "0" ]; then | |
echo "page $page_number ($grayscale)" | |
return 1 | |
fi | |
return 0 | |
} | |
function num_pages { | |
pdf_file=$1 | |
pdfinfo $pdf_file | grep "Pages:" | awk '{print $2}' | |
} | |
function minimum { | |
echo $(( $1 < $2 ? $1 : $2 )) | |
} | |
# guard against accidental deletion of files in the root directory | |
if [ -z "$DIFFDIR" ]; then | |
echo "DIFFDIR needs to be set!" | |
exit 1 | |
fi | |
pdf1_num_pages=$(num_pages $pdf_file1) | |
pdf2_num_pages=$(num_pages $pdf_file2) | |
min_pages=$(minimum $pdf1_num_pages $pdf2_num_pages) | |
if [ "$pdf1_num_pages" -ne "$pdf2_num_pages" ]; then | |
echo "PDF files have different lengths ($pdf1_num_pages and $pdf2_num_pages)" | |
rc=1 | |
fi | |
if [ -d "$DIFFDIR" ]; then | |
rm -f $DIFFDIR/* | |
else | |
mkdir $DIFFDIR | |
fi | |
# get exit status from subshells http://stackoverflow.com/a/29535256/438249 | |
function wait_for_processes { | |
local rc=0 | |
while (( "$#" )); do | |
# wait returns the exit status for the process | |
if ! wait "$1"; then | |
rc=1 | |
fi | |
shift | |
done | |
return $rc | |
} | |
function howmany() { | |
echo $# | |
} | |
rc=0 | |
pids="" | |
for page_number in `seq 1 $min_pages`; | |
do | |
diff_page $pdf_file1 $pdf_file2 $page_number & | |
pids+=" $!" | |
if [ $(howmany $pids) -eq "$MAXPROCS" ]; then | |
if ! wait_for_processes $pids; then | |
rc=1 | |
fi | |
pids="" | |
fi | |
done | |
if ! wait_for_processes $pids; then | |
rc=1 | |
fi | |
exit $rc |
who uploads an image of their code.. thanks pal
@WillCohenInfotrack
Its a good example of the task save the source as a PDF save the above SVG suggestions as a PDF and run a PDF differ to see how it performs :-) 🤣
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the updates. I needed to customize it for my use case, and I already know bash and not python, so this version made more sense for me.