Skip to content

Instantly share code, notes, and snippets.

@iamlemec
Last active October 18, 2017 20:47
Show Gist options
  • Save iamlemec/fd0ae0bd09a65179b1336369c3dd21fc to your computer and use it in GitHub Desktop.
Save iamlemec/fd0ae0bd09a65179b1336369c3dd21fc to your computer and use it in GitHub Desktop.
Diff between two PDFs. Crude, but useful for revisions. Requires wdiff and pdftotext. See diff.sh for usage.
#!/bin/sh
# Use this instead of diff[1] to get colored[2] word-based diffs.
# Useful for text documents that have reflowed paragraphs.
# Requires that wdiff is installed in your $PATH.
#
# [1] All diff options are ignored. Only replaces simplest usage.
# [2] Colors are always emitted. If piping into less, use "-R" or set LESS=-R
# Iain Murray, February 2009, Tweaked in June 2011
if [ "$#" -lt 2 ] ; then
echo 'Usage: cwdiff FILE1 FILE2'
echo ' cwdiff [any options will be discarded] FILE1 FILE2'
echo
echo 'cwdiff emits colored, word-based diffs. Requires wdiff(1).'
echo 'If piping into less use -R option or set LESS=-R'
echo
exit
fi
# Discard any options, for example SVN attempts to set a bunch of GNU diff
# options
shift $(($# - 2))
# Color commands adapted from cdiff alias by
# Paul Warren <[email protected]> 12/01/2001
# http://ex-parrot.com/pdw/cdiff.html
# colour for added lines (bright blue)
# (yellow, used in cdiff, alias is less robust to terminal background colour)
diffnew=`tput setf 1``tput bold`
# colour for removed lines (bright red)
diffold=`tput setf 4``tput bold`
# reset - original pair, unset all attributes
reset=`tput op``tput sgr0`
# The -3 option to wdiff gives no context, so I'm getting wdiff to spit out the
# whole document and using grep to trim it down. The regex for grep could be
# improved, but for text documents is probably ok, and at worst will include
# some extra output that can be ignored..
wdiff \
--start-delete "${diffold}[-" \
--end-delete "-]${reset}" \
--start-insert "${diffnew}{+" \
--end-insert "+}${reset}" \
-n "$1" "$2" \
| grep -C2 '\['
# CAREFUL: The line above contains an escape character.
# Don't copy the grep command by typing ^[ instead of 
# This grep line hits false positives more often for me:
# | grep -C2 '[{[][-+].*[-+][]}]'
#! /bin/bash
# Copyright (C) 2009-2015 Christoph Junghans
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#version 0.1 04.08.10 -- initial version
#version 0.1.1, 05.08.10 -- added -D and -o
#version 0.1.2, 04.10.10 -- make -D work again and better help
#version 0.1.3, 10.05.11 -- added --text to diff_opts to allow diff of binary files (issue #3)
#version 0.1.4, 10.05.11 -- removed --text again and handle case of binary files
#version 0.2.0, 15.04.12 -- clean up + bugfix thanks to Arne
#version 0.2.1, 15.04.12 -- clean up + new bug report address
#version 0.2.2, 20.04.12 -- fixed a bug if argument were dirs
#version 0.2.3, 18.12.12 -- replace usage of mktemp with bash built-ins
#version 0.2.4, 31.08.13 -- fixed deleted leading spaces
#version 0.2.5, 06.09.13 -- do not expand escape sequences in diff
#version 0.2.6, 18.09.13 -- allow 1st argument begin a file with --filter
#version 0.2.7, 09.03.15 -- included a patch from NetBSD
#version 0.2.8, 13.03.15 -- moved issue tracker to github
#version 0.3.0, 15.03.15 -- print help2man compliant help and version
#version 0.3.1, 30.06.15 -- fix --diffopts option (fixes #4)
#version 0.3.2, 12.11.15 -- make output processable by less -r/-R
#version 0.4.0, 12.11.15 -- dropped a2ps and out option, clean up
FAT_GREEN=""
GREEN=""
FAT_RED=""
RED="" #also used as indicator
MAGENTA=""
FAT_BLACK=""
OFF=""
NL="
"
usage="Usage: ${0##*/} [OPTIONS] FILE1 FILE2"
diff="no"
filter=
diff_opts='--new-file --unified --show-c-function --recursive'
ab=
color_filter() {
#use RED as an indicator if colors are on
if [[ ${RED} ]]; then
sed -e "s/\[-/$RED/g" -e "s/-\]/$OFF/g" -e "s/{+/$GREEN/g" -e "s/+}/$OFF/g" $1
else
cat $1
fi
}
die() {
[[ $* ]] && echo "$*"
exit 1
}
show_help () {
cat << eof
A colorized version of (w)diff, which can be used a wrapper script, too.
$usage
Options:
-f, --filter Act as a wdiff color filter only and don't execute diff/wdiff
internally, just colorize input (no ARGS = read from stdin)
-d, --diff Preprocess input with diff and before giving it to wdiff
(very useful for dirs). Option can be used in combination
with --filter option meaning input is a patch.
-D, --diffonly Process input with diff, but NOT with wdiff, so ${0##*/}
basically acts like a colorized version of diff. Option
can be used in combination with --filter option meaning
input is a patch.
--diffopts XXX Change options given to diff. The default is '$diff_opts'.
--ab Replace common trunc of dirname by 'a' and 'b'
--no-color Disable color
-- Stop parsing options
-h, --help Show this help
-v, --version Show version
Examples:
${0##*/} -d dir1 dir2 Compare two directories
${0##*/} file1 file2 Compare two files
${0##*/} --ab -D dir1 dir2 VCS like output from compaing two directories
wdiff file1 file2 | ${0##*/} -f Colorize the output of wdiff
diff -u file1 file2 | ${0##*/} -D -f Colorize the output of diff
Report bugs and comments at https://github.com/junghans/cwdiff/issues or [email protected]
eof
}
shopt -s extglob
while [[ ${1} = -?* ]]; do
if [[ ${1} = --??*=* ]]; then # case --xx=yy
set -- "${1%%=*}" "${1#*=}" "${@:2}" # --xx=yy to --xx yy
elif [[ ${1} = -[^-]?* ]]; then # case -xy split
if [[ ${1} = -[o]* ]]; then #short opts with arguments
set -- "${1:0:2}" "${1:2}" "${@:2}" # -xy to -x y
else #short opts without arguments
set -- "${1:0:2}" "-${1:2}" "${@:2}" # -xy to -x -y
fi
fi
case $1 in
--ab)
ab="yes"
shift;;
--no-color)
unset FAT_GREEN GREEN FAT_RED RED MAGENTA OFF FAT_BLACK
shift;;
-f | --filter)
filter="yes"
shift ;;
-d | --diff)
diff="yes"
shift ;;
-D | --diffonly)
diff="only"
shift ;;
--diffopts)
diff_opts="$2"
shift 2;;
-h | --help)
show_help
exit 0;;
--vcs)
echo "${0##*/}: $(sed -ne 's/^#version.* -- \(.*$\)/\1/p' $0 | sed -n '$p')"
exit 0;;
-v | --version)
cat << eov
${0##*/} $(sed -ne 's/^#version \(.*\) -- .*$/\1/p' "$0" | sed -n '$p')
$(sed -ne 's/^# \(Copyright.*\)/\1/p' "$0")
This is free software: you are free to change and redistribute it.
License GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl2.html>
There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Written by C. Junghans <[email protected]>
eov
exit 0;;
--)
shift
break;;
*)
die "Unknown option '$1'";;
esac
done
[[ $1 = - ]] && filter="yes" && shift
if [[ ! $filter ]]; then
[[ ! $1 || ! $2 ]] && die "Please specify two files/dirs or add --filter option"
#use -e as it could be file or dir
[[ -e $1 ]] || die "Could not read file/dir '$1'"
[[ -e $2 ]] || die "Could not read file/dir '$2'"
[[ $3 ]] && die "I don't know what to do with arguments '${@:3}'"
else
[[ -n $1 && ! -f $1 ]] && die "Could not read file '$1'"
[[ $2 ]] && die "I don't know what to do with arguments '$*' together --filter option"
fi
if [[ $diff != no ]]; then
if [[ ! $filter ]]; then
exec 3< <(diff $diff_opts "$1" "$2")
#don't die here, because diff of binary files give exit code = 2
else
[[ $1 ]] && exec 3<$1 || exec 3<&0
fi
# don't do this if we have not files ;-)
if [[ $ab && $1 && $2 ]]; then
#find the longest equal part in $1 and $2 from the end
for ((i=1;i<=(${#1}<${#2}?${#1}:${#2});i++)); do
[[ ${1:0-$i} != ${2:0-$i} ]] && break
done
((i--))
a="${1:0:${#1}-$i}"
b="${2:0:${#2}-$i}"
else
a=a; b=b
fi
# -r to not expand escape sequences in diff
while read -r -u 3; do
#small trick, because "read -u 3 i" would split the line and
#leading space would be lost.
i="${REPLY}"
if [[ $i = "Files "*" and "*" differ" ]]; then # binary case
i="${i/$a/a}"
i="${i/$b/b}"
echo "$i"
elif [[ $i = diff* ]]; then # diff header line
i="${i/ $diff_opts}"
i="${i/$a/a}"
echo "$FAT_BLACK${i/$b/b}$OFF"
elif [[ $i = ---* ]]; then
echo "${FAT_RED}${i/$a/a}${OFF}"
elif [[ $i = +++* ]]; then
echo "${FAT_GREEN}${i/$b/b}${OFF}"
elif [[ $i = @@* ]]; then
echo "${MAGENTA}${i}${OFF}"
elif [[ $i = -* ]]; then
[[ $diff = only ]] && echo "${RED}${i}${OFF}" || t1+="${i#-}$NL"
elif [[ $i = +* ]]; then
[[ $diff = only ]] && echo "${GREEN}${i}${OFF}" || t2+="${i#+}$NL"
else
# only true for diff != only
# cut the last newline do avoid an empty line at the end (echo append newline)
# echo -n would also work, but wdiff has strange behaviour if the 2nd file is
# empty, it will not append newline, which make the output look strange
[[ $t1 || $t2 ]] && { wdiff ${RED:+-n} <(echo "${t1%$NL}") <(echo "${t2%$NL}") | color_filter; }
t1=
t2=
[[ $diff = only ]] && echo "${i}" || echo "${i## }"
fi
done
# thanks to Arne Babenhauserheide for pointing out this case is missing
# if there was + or - lines at the end, which has not been printed yet
[[ $t1 || $t2 ]] && { wdiff ${RED:+-n} <(echo "${t1%$NL}") <(echo "${t2%$NL}") | color_filter; }
elif [[ $filter ]]; then
color_filter $1
else
wdiff ${RED:+-n} "$1" "$2" | color_filter
fi
# sample usage
# cwdiff gives {+added+} and {-removed-} in highlighted text
# cwdiff2 gives add words in green and removed words in red
pdftotext old.pdf old.txt
pdftotext new.pdf new.txt
cwdiff old.txt new.txt > diff.txt
less -N diff.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment