To diff common binary files in git using appropriate external converters such as unrtf, pandoc, docx2txt.pl, odt2txt, git-xlsx-textconv, git-xlsx-textconv.pl or pptx2md, add to ~/.config/git/config the lines
[diff]
[diff "pdf"]
# Diff PDFs by their extracted text. When pdftotext is not on PATH the raw
# file is passed through unchanged so that git still gets some output.
# Alternative converter: textconv = pdfinfo
textconv = "pdf2text(){ command -v pdftotext >/dev/null 2>&1 || { cat \"$1\"; return; }; pdftotext -layout -enc UTF-8 -nopgbrk -q \"$1\" -; }; pdf2text"
binary = true
cachetextconv = true
[diff "djvu"]
# Diff DjVu files by the text djvutxt prints. When djvutxt is not on PATH
# the raw file is passed through unchanged.
textconv = "djvu2text(){ command -v djvutxt >/dev/null 2>&1 || { cat \"$1\"; return; }; djvutxt \"$1\"; }; djvu2text"
binary = true
cachetextconv = true
[diff "odt"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   odt2txt, pandoc, soffice --cat, tika
# If none succeeds the raw file is printed as a last resort.
textconv = "odt2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
attempt odt2txt \"$src\"; \
attempt pandoc --standalone --from=odt --to=plain \"$src\"; \
attempt soffice --cat \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; odt2text"
binary = true
cachetextconv = true
[diff "doc"]
# Diff legacy Word documents as text.
# Fallback order: catdoc → wvText → antiword → soffice --cat → tika → cat
# Each converter is used only if it is on PATH; the first one that exits
# successfully ends the script, otherwise the next one is tried.
# wvText is given /dev/stdout as its output file so that the text reaches git
# instead of being written to a file literally named "-".
textconv = "f(){ in=\"$1\"; \
if command -v catdoc >/dev/null 2>&1; then catdoc \"$in\" && exit 0; fi; \
if command -v wvText >/dev/null 2>&1; then wvText \"$in\" /dev/stdout && exit 0; fi; \
if command -v antiword >/dev/null 2>&1; then antiword -w 0 \"$in\" && exit 0; fi; \
if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
if command -v tika >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
cat \"$in\"; \
}; f"
binary = true
cachetextconv = true
[diff "xls"]
# Diff legacy Excel workbooks as text.
# Fallback order: in2csv → xlscat (-a UTF-8) → soffice --convert-to csv → soffice --cat → tika → cat
# The soffice CSV step runs in a throwaway directory. It only counts as a
# success when the temporary directory could be created and a .csv file could
# actually be printed; otherwise the directory is removed and the next
# converter is tried.
textconv = "f(){ in=\"$1\"; \
if command -v in2csv >/dev/null 2>&1; then in2csv \"$in\" && exit 0; fi; \
if command -v xlscat >/dev/null 2>&1; then xlscat -a UTF-8 \"$in\" && exit 0; fi; \
if command -v soffice >/dev/null 2>&1 && tmpd=$(mktemp -d); then \
if soffice --headless --convert-to csv --outdir \"$tmpd\" \"$in\" 1>/dev/null && cat \"$tmpd\"/*.csv 2>/dev/null; then rm -rf \"$tmpd\"; exit 0; fi; \
rm -rf \"$tmpd\"; \
fi; \
if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
if command -v tika >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
cat \"$in\"; \
}; f"
binary = true
cachetextconv = true
[diff "ppt"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   catppt, soffice --cat, tika
# If none succeeds the raw file is printed as a last resort.
textconv = "ppt2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
attempt catppt \"$src\"; \
attempt soffice --cat \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; ppt2text"
binary = true
cachetextconv = true
[diff "docx"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   pandoc, docx2txt.pl, soffice --cat, tika
# If none succeeds the raw file is printed as a last resort.
textconv = "docx2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
attempt pandoc --standalone --from=docx --to=plain \"$src\"; \
attempt docx2txt.pl \"$src\" -; \
attempt soffice --cat \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; docx2text"
binary = true
cachetextconv = true
[diff "xlsx"]
# Diff Excel workbooks as text.
# Fallback order: xlsx2csv → xlscat → in2csv → soffice --convert-to csv → soffice --cat → tika → cat
# The soffice CSV step runs in a throwaway directory. It only counts as a
# success when the temporary directory could be created and a .csv file could
# actually be printed; otherwise the directory is removed and the next
# converter is tried.
textconv = "f(){ in=\"$1\"; \
if command -v xlsx2csv >/dev/null 2>&1; then xlsx2csv --all --ignoreempty --delimiter x09 --outputencoding utf-8 \"$in\" && exit 0; fi; \
if command -v xlscat >/dev/null 2>&1; then xlscat --trim -S all \"$in\" && exit 0; fi; \
if command -v in2csv >/dev/null 2>&1; then in2csv \"$in\" && exit 0; fi; \
if command -v soffice >/dev/null 2>&1 && tmpd=$(mktemp -d); then \
if soffice --headless --convert-to csv --outdir \"$tmpd\" \"$in\" 1>/dev/null && cat \"$tmpd\"/*.csv 2>/dev/null; then rm -rf \"$tmpd\"; exit 0; fi; \
rm -rf \"$tmpd\"; \
fi; \
if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
if command -v tika >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
cat \"$in\"; \
}; f"
binary = true
cachetextconv = true
[diff "pptx"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   pptx2md (through a temporary output file), soffice --cat, tika
# If none succeeds the raw file is printed as a last resort.
# The temporary file is removed whether or not pptx2md succeeds.
textconv = "pptx2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
if command -v pptx2md >/dev/null 2>&1; then md=$(mktemp); \
pptx2md --disable-image --disable-wmf --disable-escaping \"$src\" -o \"$md\" 1>/dev/null && { cat \"$md\"; rm -f \"$md\"; exit 0; }; \
rm -f \"$md\"; \
fi; \
attempt soffice --cat \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; pptx2text"
binary = true
cachetextconv = true
[diff "rtf"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   unrtf, pandoc, soffice --cat, tika
# If none succeeds the raw file is printed as a last resort.
textconv = "rtf2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
attempt unrtf --text \"$src\"; \
attempt pandoc --standalone --from=rtf --to=plain \"$src\"; \
attempt soffice --cat \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; rtf2text"
binary = true
cachetextconv = true
[diff "epub"]
# Converters are tried in this order; the first one that is installed and
# exits successfully ends the script:
#   pandoc, tika
# If none succeeds the raw file is printed as a last resort.
textconv = "epub2text(){ src=\"$1\"; \
attempt(){ command -v \"$1\" >/dev/null 2>&1 && \"$@\" && exit 0; }; \
attempt pandoc --standalone --from=epub --to=plain \"$src\"; \
attempt tika --text \"$src\"; \
cat \"$src\"; \
}; epub2text"
binary = true
cachetextconv = true
[diff "tika"]
# Generic driver: diff the text that tika extracts. When tika is not on PATH
# the raw file is passed through unchanged.
textconv = "tika2text(){ command -v tika >/dev/null 2>&1 || { cat \"$1\"; return; }; tika --text \"$1\"; }; tika2text"
binary = true
cachetextconv = true
[diff "libreoffice"]
# Generic driver: diff the text that soffice --cat prints. Like the other
# drivers in this file, it passes the raw file through unchanged when soffice
# is not on PATH, instead of making git fail on a missing command.
textconv = "f(){ if command -v soffice >/dev/null 2>&1; then soffice --cat \"$1\"; else cat \"$1\"; fi; }; f"
binary = true
cachetextconv = true

and add to ~/.config/git/attributes the lines
# Map file extensions to the diff drivers defined in the git config.
# gitattributes patterns have no {a,b} alternation, and "?" matches exactly
# one character, so every extension gets its own line.
*.pdf diff=pdf
*.djvu diff=djvu
*.odt diff=odt
*.odp diff=libreoffice
*.ods diff=libreoffice
*.doc diff=doc
*.xls diff=xls
*.ppt diff=ppt
*.docx diff=docx
*.xlsx diff=xlsx
*.pptx diff=pptx
*.rtf diff=rtf
*.epub diff=epub
*.chm diff=tika
*.mht diff=tika
*.mhtml diff=tika
*.class diff=tika
*.jar diff=tika
*.rar diff=tika
*.7z diff=tika
*.zip diff=tika

LibreOffice is an office suite that (together with a common text browser such as lynx) can handle all those formats listed above, except PDFs.
(To use it on Microsoft Windows, ensure after its installation that its path is added to the %PATH% environment variable, say by Rapidee.)
Tika is a content extractor that can handle all the formats listed above and many more. To use it:
- Download the latest runnable `tika-app-...jar` from Tika to `~/bin/tika.jar` (on Linux) respectively `%USERPROFILE%\bin\tika.jar` (on Microsoft Windows).
- Create
  - on Linux, a shell script `~/bin/tika` that reads

        #!/bin/sh
        exec java -Dfile.encoding=UTF-8 -jar "$HOME/bin/tika.jar" "$@" 2>/dev/null

    and mark it executable (by `chmod a+x ~/bin/tika`).
  - on Microsoft Windows, a batch script `%USERPROFILE%\bin\tika.bat` that reads

        @echo off
        java -Dfile.encoding=UTF-8 -jar "%USERPROFILE%\bin\tika.jar" %*

- Add the folder of the newly created `tika` executable to your environment variable `$PATH` (on Linux) respectively `%PATH%` (on Microsoft Windows):
  - on Linux, if you use `bash` or `zsh`, by adding to `~/.profile` or `~/.zshenv` the line

        PATH=$PATH:~/bin

  - on Microsoft Windows, a convenient program to update `%PATH%` is Rapidee.