Skip to content

Instantly share code, notes, and snippets.

@darcyparker
Last active November 25, 2020 20:05
Show Gist options
  • Save darcyparker/016930578862a407e0f0 to your computer and use it in GitHub Desktop.
Save darcyparker/016930578862a407e0f0 to your computer and use it in GitHub Desktop.
Search PDFs from command line
#!/usr/bin/env bash
#Search pdfs: a wrapper for pdfgrep and pdftotext
#http://github.com/darcyparker
_usage(){
local _filename=${0##*/}
cat << END_USAGE
Usage: $_filename
Search current directory:
$_filename <search pattern>
Search a specific directory:
$_filename <directory> <search pattern>
END_USAGE
exit 2
}
_requirements(){
cat << END_REQUIREMENTS
Requirements:
This requires pdfgrep or (pdftotext and grep)
pdfgrep is preferred because it reports page numbers accurately
Tip: On linux "apt-get install pdfgrep"
Tip: On linux "pdftotext" is a part of poppler-utils or xpdf-utils
: "apt-get install poppler-utils"
: "apt-get install xpdf-utils"
END_REQUIREMENTS
exit 1
}
_searchWithPdfToText(){
find "$1" -name "*.pdf" -type f -print0 | xargs -I{} -0 /usr/bin/env bash -c "pdftotext '{}' - | grep --with-filename --label='{}' --color \"$2\""
exit $?
}
_searchWithPdfgrep(){
#find "$1" -name "*.pdf" -type f -print0 | xargs -I{} -0 pdfgrep -H -n "$2" '{}'
pdfgrep -H -n -r "$2" "$1"
exit $?
}
_search(){
#First try pdfgrep
[ -x "$(command -v pdfgrep)" ] && _searchWithPdfgrep "$@"
#if there is no pdfgrep, use pdftotext
[ -x "$(command -v pdftotext)" ] && [ -x "$(command -v grep)" ] && _searchWithPdfToText "$@"
#if pdfgrep or pdftotext and grep are not available, then show requirements
_requirements
}
_main(){
[ $# -eq 0 ] || [ $# -gt 2 ] && _usage "$0" $#
[ $# -eq 1 ] && local _searchdir=$PWD && local _searchpattern=$1
[ $# -eq 2 ] && local _searchdir=$1 && local _searchpattern=$2
_search "$_searchdir" "$_searchpattern"
}
_main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment