Skip to content

Instantly share code, notes, and snippets.

@Konfekt
Last active October 28, 2025 11:14
Show Gist options
  • Select an option

  • Save Konfekt/5ece511a94a8aa118aadbbb23dab1f21 to your computer and use it in GitHub Desktop.

Select an option

Save Konfekt/5ece511a94a8aa118aadbbb23dab1f21 to your computer and use it in GitHub Desktop.
git diff common binary files

To diff common binary files in git using appropriate external converters such as unrtf, pandoc, docx2txt.pl, odt2txt, git-xlsx-textconv, git-xlsx-textconv.pl or pptx2md, add the following lines to ~/.config/git/config:

[diff]
    [diff "pdf"]
      # Convert PDFs to text with pdftotext (layout-preserving, UTF-8 output, no
      # page-break characters, quiet). If pdftotext is not installed, the raw file
      # is passed through unchanged.
      binary = true
      textconv = "f(){ if command -v pdftotext >/dev/null 2>&1; then pdftotext -layout -enc UTF-8 -nopgbrk -q \"$1\" -; else cat \"$1\"; fi; }; f"
      # Disabled alternative: use pdfinfo as the converter instead.
      # textconv = pdfinfo
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "djvu"]
      # Convert DjVu files to text with djvutxt. If djvutxt is not installed, the
      # raw file is passed through unchanged.
      binary = true
      textconv = "f(){ if command -v djvutxt >/dev/null 2>&1; then djvutxt \"$1\"; else cat \"$1\"; fi; }; f"
      # NOTE(review): the disabled alternative below names pdfinfo, which looks
      # copied from the pdf driver; confirm it makes sense for DjVu before enabling.
      # textconv = pdfinfo
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "odt"]
      # Convert OpenDocument text files to plain text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: odt2txt → pandoc → soffice --cat → tika → cat
      textconv = "f(){ in=\"$1\"; \
        if command -v odt2txt >/dev/null 2>&1; then odt2txt \"$in\" && exit 0; fi; \
        if command -v pandoc  >/dev/null 2>&1; then pandoc --standalone --from=odt  --to=plain \"$in\" && exit 0; fi; \
        if command -v soffice  >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika    >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "doc"]
      # Convert legacy Word (.doc) files to plain text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: catdoc → wvText → antiword → soffice --cat → tika → cat
      # wvText expects an output *file* as its second argument, so it is pointed
      # at /dev/stdout; a bare "-" would be treated as a file name.
      textconv = "f(){ in=\"$1\"; \
        if command -v catdoc   >/dev/null 2>&1; then catdoc \"$in\" && exit 0; fi; \
        if command -v wvText   >/dev/null 2>&1; then wvText \"$in\" /dev/stdout && exit 0; fi; \
        if command -v antiword >/dev/null 2>&1; then antiword -w 0 \"$in\" && exit 0; fi; \
        if command -v soffice  >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika     >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true
    [diff "xls"]
      # Convert legacy Excel (.xls) files to text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: in2csv → xlscat (-a UTF-8) → soffice --convert-to csv → soffice --cat → tika → cat
      # The soffice CSV conversion writes into a temporary directory. It only
      # counts as a success if a CSV file was actually produced and printed;
      # otherwise the remaining fallbacks are still tried. The temporary
      # directory is removed in both cases.
      textconv = "f(){ in=\"$1\"; \
        if command -v in2csv  >/dev/null 2>&1; then in2csv \"$in\" && exit 0; fi; \
        if command -v xlscat  >/dev/null 2>&1; then xlscat -a UTF-8 \"$in\" && exit 0; fi; \
        if command -v soffice >/dev/null 2>&1; then \
          if tmpd=$(mktemp -d); then \
            if soffice --headless --convert-to csv --outdir \"$tmpd\" \"$in\" 1>/dev/null && cat \"$tmpd\"/*.csv 2>/dev/null; then rm -rf \"$tmpd\"; exit 0; fi; \
            rm -rf \"$tmpd\"; \
          fi; \
          soffice --cat \"$in\" && exit 0; \
        fi; \
        if command -v tika    >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true
    [diff "ppt"]
      # Convert legacy PowerPoint (.ppt) files to plain text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: catppt → soffice --cat → tika → cat
      textconv = "f(){ in=\"$1\"; \
        if command -v catppt  >/dev/null 2>&1; then catppt \"$in\" && exit 0; fi; \
        if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika    >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
        }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "docx"]
      # Convert Word (.docx) files to plain text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: pandoc → docx2txt.pl → soffice --cat → tika → cat
      textconv = "f(){ in=\"$1\"; \
        if command -v pandoc      >/dev/null 2>&1; then pandoc --standalone --from=docx --to=plain \"$in\" && exit 0; fi; \
        if command -v docx2txt.pl >/dev/null 2>&1; then docx2txt.pl \"$in\" - && exit 0; fi; \
        if command -v soffice     >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika        >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true
    [diff "xlsx"]
      # Convert Excel (.xlsx) files to text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: xlsx2csv → xlscat → in2csv → soffice --convert-to csv → soffice --cat → tika → cat
      # The soffice CSV conversion writes into a temporary directory. It only
      # counts as a success if a CSV file was actually produced and printed;
      # otherwise the remaining fallbacks are still tried. The temporary
      # directory is removed in both cases.
      textconv = "f(){ in=\"$1\"; \
        if command -v xlsx2csv >/dev/null 2>&1; then xlsx2csv --all --ignoreempty --delimiter x09 --outputencoding utf-8 \"$in\" && exit 0; fi; \
        if command -v xlscat   >/dev/null 2>&1; then xlscat --trim -S all \"$in\" && exit 0; fi; \
        if command -v in2csv   >/dev/null 2>&1; then in2csv \"$in\" && exit 0; fi; \
        if command -v soffice  >/dev/null 2>&1; then \
          if tmpd=$(mktemp -d); then \
            if soffice --headless --convert-to csv --outdir \"$tmpd\" \"$in\" 1>/dev/null && cat \"$tmpd\"/*.csv 2>/dev/null; then rm -rf \"$tmpd\"; exit 0; fi; \
            rm -rf \"$tmpd\"; \
          fi; \
          soffice --cat \"$in\" && exit 0; \
        fi; \
        if command -v tika     >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true
    [diff "pptx"]
      # Convert PowerPoint (.pptx) files to text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: pptx2md → soffice --cat → tika → cat
      # pptx2md is given an output file (-o), so its result goes to a temporary
      # file that is printed and then removed; the file is also removed on failure.
      textconv = "f(){ in=\"$1\"; \
        if command -v pptx2md >/dev/null 2>&1; then out=$(mktemp); \
          if pptx2md --disable-image --disable-wmf --disable-escaping \"$in\" -o \"$out\" 1>/dev/null; then cat \"$out\"; rm -f \"$out\"; exit 0; fi; \
          rm -f \"$out\"; \
        fi; \
        if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika    >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "rtf"]
      # Convert RTF files to plain text.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: unrtf → pandoc → soffice --cat → tika → cat
      textconv = "f(){ in=\"$1\"; \
        if command -v unrtf   >/dev/null 2>&1; then unrtf --text \"$in\" && exit 0; fi; \
        if command -v pandoc  >/dev/null 2>&1; then pandoc --standalone --from=rtf --to=plain \"$in\" && exit 0; fi; \
        if command -v soffice >/dev/null 2>&1; then soffice --cat \"$in\" && exit 0; fi; \
        if command -v tika    >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "epub"]
      # Convert EPUB files to plain text. This driver is selected with diff=epub
      # in a gitattributes file.
      # Each converter found on PATH is tried in turn; the first one that exits
      # successfully ends the script. If none succeeds, the raw file is printed.
      # Fallback order: pandoc → tika → cat
      textconv = "f(){ in=\"$1\"; \
        if command -v pandoc >/dev/null 2>&1; then pandoc --standalone --from=epub --to=plain \"$in\" && exit 0; fi; \
        if command -v tika   >/dev/null 2>&1; then tika --text \"$in\" && exit 0; fi; \
        cat \"$in\"; \
      }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

    [diff "tika"]
      # Generic driver: extract text with `tika --text` when a tika command is on
      # PATH; otherwise the raw file is passed through unchanged.
      textconv = "f(){ if command -v tika >/dev/null 2>&1; then tika --text \"$1\"; else cat \"$1\"; fi; }; f"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true
    [diff "libreoffice"]
      # Generic driver: print the document as text with `soffice --cat`.
      # Unlike the other drivers there is no fallback, so soffice must be on PATH.
      textconv = "soffice --cat"
      binary = true
      # Cache the converted text so the same blob is not converted again.
      cachetextconv = true

and add the following lines to ~/.config/git/attributes:

# Map file extensions to the diff drivers defined in the git config.
# Patterns are plain globs: "?" matches exactly one character (it is not an
# "optional" marker) and brace expansion such as *.{a,b} is not supported,
# so every extension gets its own line.
*.pdf    diff=pdf
*.djvu   diff=djvu

*.odt    diff=odt
*.odp    diff=libreoffice
*.ods    diff=libreoffice

*.doc    diff=doc
*.xls    diff=xls
*.ppt    diff=ppt

*.docx   diff=docx
*.xlsx   diff=xlsx
*.pptx   diff=pptx

*.rtf    diff=rtf

# The EPUB driver is the [diff "epub"] section; no "pandoc" driver is defined.
*.epub   diff=epub
*.chm    diff=tika
*.mht    diff=tika
*.mhtml  diff=tika

*.class  diff=tika
*.jar    diff=tika

*.rar    diff=tika
*.7z     diff=tika
*.zip    diff=tika

LibreOffice is an office suite that (together with a common text browser such as lynx) can handle all the formats listed above, except PDFs. (To use it on Microsoft Windows, make sure after installation that its folder is added to the %PATH% environment variable, for example with Rapidee.)

Tika is a content extractor that can handle all the formats listed above and many more. To use it:

  1. Download the latest runnable tika-app-...jar from Tika and save it as ~/bin/tika.jar (on Linux) or %USERPROFILE%\bin\tika.jar (on Microsoft Windows).

  2. Create

    • on Linux, a shell script ~/bin/tika that reads
        #!/bin/sh
        # Run the Tika jar from ~/bin with UTF-8 as the JVM file encoding, passing
        # all arguments through. stderr is discarded, so only stdout reaches the
        # caller. exec replaces this shell with the java process.
        exec java -Dfile.encoding=UTF-8 -jar "$HOME/bin/tika.jar" "$@" 2>/dev/null

    and mark it executable (by chmod a+x ~/bin/tika).

    • on Microsoft Windows, a batch script %USERPROFILE%\bin\tika.bat that reads
        @echo off
        rem Run the Tika jar from %USERPROFILE%\bin with UTF-8 as the JVM file
        rem encoding, passing all arguments through.
        java -Dfile.encoding=UTF-8 -jar "%USERPROFILE%\bin\tika.jar" %*
  3. Add the folder containing the newly created tika executable to your $PATH environment variable (on Linux) or %PATH% (on Microsoft Windows):

    • on Linux, if you use bash or zsh, add the following line to ~/.profile (bash) or ~/.zshenv (zsh):
        # Append ~/bin to the command search path.
        PATH=$PATH:~/bin
    • on Microsoft Windows, a convenient program to update %PATH% is Rapidee.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment